From ecd5bd2b957d5e1ced91601dce5d829287a19b9d Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 3 Nov 2008 17:11:45 +0000
Subject: [PATCH] Add support of --cpu_bind and --mem_bind to salloc

---
 NEWS                  |   2 +-
 doc/man/man1/salloc.1 | 125 +++++++++++++++++++++++++++++++++++++++++-
 doc/man/man1/sbatch.1 |   2 +-
 src/salloc/opt.c      | 105 ++++++++++++++++++++++++++++-------
 src/salloc/opt.h      |   3 +
 src/salloc/salloc.c   |  12 ++++
 src/sbatch/opt.c      |   4 +-
 7 files changed, 227 insertions(+), 26 deletions(-)

diff --git a/NEWS b/NEWS
index a8e3cef1412..5ec01a20e5f 100644
--- a/NEWS
+++ b/NEWS
@@ -18,7 +18,7 @@ documents those changes that are of interest to users and admins.
     within the job step credential.
  -- Add cpu_bind, cpu_bind_type, mem_bind and mem_bind_type to job allocation
     request and job_details structure in slurmctld. Add support to --cpu_bind
-    and --mem_bind options from sbatch command.
+    and --mem_bind options from salloc and sbatch commands.
 
 * Changes in SLURM 1.4.0-pre3
 =============================
diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1
index c4451cfaa7a..90af181c579 100644
--- a/doc/man/man1/salloc.1
+++ b/doc/man/man1/salloc.1
@@ -1,4 +1,4 @@
-.TH "salloc" "1" "SLURM 1.3" "August 2008" "SLURM Commands"
+.TH "salloc" "1" "SLURM 1.4" "November 2008" "SLURM Commands"
 .SH "NAME"
 .LP 
 salloc \- Obtain a SLURM job allocation (a set of nodes), execute a command, 
@@ -128,6 +128,63 @@ An arbitrary comment.
 Demand a contiguous range of nodes. The default is "yes". Specify
 \-\-contiguous=no if a contiguous range of nodes is not required.
 
+.TP
+\fB\-\-cpu_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR
+Bind tasks to CPUs. Used only when the task/affinity plugin is enabled.
+The configuration parameter \fBTaskPluginParam\fR may override these options.
+
+The \fBSLURM_CPU_BIND\fR environment variable is set when \fB\-\-cpu_bind\fR
+is in use.
+
+When using \fB\-\-cpus\-per\-task\fR to run multithreaded tasks, be aware that
+CPU binding is inherited from the parent of the process.  This means that
+the multithreaded task should either specify or clear the CPU binding
+itself to avoid having all threads of the multithreaded task use the same
+mask/CPU as the parent.  Alternatively, fat masks (masks which specify more 
+than one allowed CPU) could be used for the tasks in order to provide
+multiple CPUs for the multithreaded tasks.
+
+Supported options include:
+.PD 1
+.RS
+.TP
+.B q[uiet]
+quietly bind before task runs (default)
+.TP
+.B v[erbose]
+verbosely report binding before task runs
+.TP
+.B no[ne]
+don't bind tasks to CPUs (default)
+.TP
+.B rank
+bind by task rank
+.TP
+.B map_cpu:<list>
+bind by mapping CPU IDs to tasks as specified
+where <list> is <cpuid1>,<cpuid2>,...<cpuidN>.
+CPU IDs are interpreted as decimal values unless they are preceded
+with '0x' in which case they are interpreted as hexadecimal values.
+.TP
+.B mask_cpu:<list>
+bind by setting CPU masks on tasks as specified
+where <list> is <mask1>,<mask2>,...<maskN>.
+CPU masks are \fBalways\fR interpreted as hexadecimal values but can be
+preceded with an optional '0x'.
+.TP
+.B sockets
+auto\-generated masks bind to sockets
+.TP
+.B cores
+auto\-generated masks bind to cores
+.TP
+.B threads
+auto\-generated masks bind to threads
+.TP
+.B help
+show this help message
+.RE
+
 .TP 
 \fB\-c\fR, \fB\-\-cpus\-per\-task\fR[=]<\fIncpus\fR>
 Advise the SLURM controller that ensuing job steps will require \fIncpus\fR 
@@ -330,6 +387,59 @@ are allocated to jobs (\fBSelectType=select/linear\fR).
 Also see \fB\-\-mem\-per\-cpu\fR.
 \fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive.
 
+.TP
+\fB\-\-mem_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR
+Bind tasks to memory. Used only when the task/affinity plugin is enabled 
+and the NUMA memory functions are available.
+\fBNote that the resolution of CPU and memory binding 
+may differ on some architectures.\fR For example, CPU binding may be performed 
+at the level of the cores within a processor while memory binding will 
+be performed at the level of nodes, where the definition of "nodes" 
+may differ from system to system. \fBThe use of any type other than 
+"none" or "local" is not recommended.\fR
+If you want greater control, try running a simple test code with the 
+options "\-\-cpu_bind=verbose,none \-\-mem_bind=verbose,none" to determine 
+the specific configuration.
+
+The \fBSLURM_MEM_BIND\fR environment variable is set when \fB\-\-mem_bind\fR
+is in use.
+
+Supported options include:
+.RS
+.TP
+.B q[uiet]
+quietly bind before task runs (default)
+.TP
+.B v[erbose]
+verbosely report binding before task runs
+.TP
+.B no[ne]
+don't bind tasks to memory (default)
+.TP
+.B rank
+bind by task rank (not recommended)
+.TP
+.B local
+Use memory local to the processor in use
+.TP
+.B map_mem:<list>
+bind by mapping a node's memory to tasks as specified
+where <list> is <cpuid1>,<cpuid2>,...<cpuidN>.
+CPU IDs are interpreted as decimal values unless they are preceded
+with '0x' in which case they are interpreted as hexadecimal values
+(not recommended)
+.TP
+.B mask_mem:<list>
+bind by setting memory masks on tasks as specified
+where <list> is <mask1>,<mask2>,...<maskN>.
+Memory masks are \fBalways\fR interpreted as hexadecimal values.
+Note that masks must be preceded with a '0x' if they don't begin
+with [0-9] so they are seen as numerical values by salloc.
+.TP
+.B help
+show this help message
+.RE
+
 .TP
 \fB\-\-mem\-per\-cpu\fR[=]<\fIMB\fR>
 Mimimum memory required per allocated CPU in MegaBytes.
@@ -632,6 +742,9 @@ Same as \fB\-\-bell\fR.
 \fBSALLOC_CONN_TYPE\fR
 Same as \fB\-\-conn\-type\fR.
 .TP
+\fBSALLOC_CPU_BIND\fR
+Same as \fB\-\-cpu_bind\fR.
+.TP
 \fBSALLOC_DEBUG\fR
 Same as \fB\-v\fR or \fB\-\-verbose\fR.
 .TP
@@ -647,6 +760,9 @@ Same as \fB\-I\fR or \fB\-\-immediate\fR.
 \fBSALLOC_JOBID\fR
 Same as \fB\-\-jobid\fR.
 .TP
+\fBSALLOC_MEM_BIND\fR
+Same as \fB\-\-mem_bind\fR.
+.TP
 \fBSALLOC_NETWORK\fR
 Same as \fB\-\-network\fR.
 .TP
@@ -672,7 +788,9 @@ Same as \fB\-W\fR or \fB\-\-wait\fR.
 .PP
 salloc will set the following environment variables in the environment of
 the executed program:
-
+.TP
+\fBSLURM_CPU_BIND\fR
+Set to value of the \fB\-\-cpu_bind\fR option.
 .TP
 \fBSLURM_JOB_ID\fR (and \fBSLURM_JOBID\fR for backwards compatibility)
 The ID of the job allocation.
@@ -691,6 +809,9 @@ List of nodes allocated to the job.
 \fBSLURM_JOB_NUM_NODES\fR (and \fBSLURM_NNODES\fR for backwards compatibility)
 Total number of nodes in the job allocation.
 .TP
+\fBSLURM_MEM_BIND\fR
+Set to value of the \fB\-\-mem_bind\fR option.
+.TP
 \fBSLURM_TASKS_PER_NODE\fR
 Number of tasks to be initiated on each node. Values are
 comma separated and in the same order as SLURM_NODELIST.
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index 6381ffafd8c..2b27b9d1687 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -1,4 +1,4 @@
-.TH "sbatch" "1" "SLURM 1.3" "July 2008" "SLURM Commands"
+.TH "sbatch" "1" "SLURM 1.4" "November 2008" "SLURM Commands"
 .SH "NAME"
 .LP 
 sbatch \- Submit a batch script to SLURM.
diff --git a/src/salloc/opt.c b/src/salloc/opt.c
index 85bcd1e4942..f8d698db5f1 100644
--- a/src/salloc/opt.c
+++ b/src/salloc/opt.c
@@ -64,6 +64,7 @@
 #include "src/common/proc_args.h"
 #include "src/common/read_config.h" /* contains getnodename() */
 #include "src/common/slurm_protocol_api.h"
+#include "src/common/slurm_resource_info.h"
 #include "src/common/slurm_rlimits_info.h"
 #include "src/common/uid.h"
 #include "src/common/xmalloc.h"
@@ -77,20 +78,24 @@
 #define OPT_INT         0x01
 #define OPT_STRING      0x02
 #define OPT_DEBUG       0x03
-#define OPT_NODES       0x05
-#define OPT_BOOL        0x06
-#define OPT_CORE        0x07
-#define OPT_CONN_TYPE	0x08
-#define OPT_NO_ROTATE	0x0a
-#define OPT_GEOMETRY	0x0b
-#define OPT_BELL        0x0f
-#define OPT_NO_BELL     0x10
-#define OPT_JOBID       0x11
-#define OPT_EXCLUSIVE   0x12
-#define OPT_OVERCOMMIT  0x13
-#define OPT_ACCTG_FREQ  0x14
+#define OPT_NODES       0x04
+#define OPT_BOOL        0x05
+#define OPT_CORE        0x06
+#define OPT_CONN_TYPE	0x07
+#define OPT_NO_ROTATE	0x08
+#define OPT_GEOMETRY	0x09
+#define OPT_BELL        0x0a
+#define OPT_NO_BELL     0x0b
+#define OPT_JOBID       0x0c
+#define OPT_EXCLUSIVE   0x0d
+#define OPT_OVERCOMMIT  0x0e
+#define OPT_ACCTG_FREQ  0x0f
+#define OPT_CPU_BIND    0x10
+#define OPT_MEM_BIND    0x11
 
 /* generic getopt_long flags, integers and *not* valid characters */
+#define LONG_OPT_CPU_BIND    0x101
+#define LONG_OPT_MEM_BIND    0x102
 #define LONG_OPT_JOBID       0x105
 #define LONG_OPT_TMP         0x106
 #define LONG_OPT_MEM         0x107
@@ -236,7 +241,10 @@ static void _opt_default()
 	opt.ntasks_per_node      = NO_VAL; /* ntask max limits */
 	opt.ntasks_per_socket    = NO_VAL;
 	opt.ntasks_per_core      = NO_VAL;
-	opt.cpu_bind_type = 0;		/* local dummy variable for now */
+	opt.cpu_bind_type = 0;
+	opt.cpu_bind = NULL;
+	opt.mem_bind_type = 0;
+	opt.mem_bind = NULL;
 	opt.time_limit = NO_VAL;
 	opt.time_limit_str = NULL;
 	opt.partition = NULL;
@@ -313,21 +321,23 @@ struct env_vars {
 
 env_vars_t env_vars[] = {
   {"SALLOC_ACCOUNT",       OPT_STRING,     &opt.account,       NULL           },
+  {"SALLOC_ACCTG_FREQ",    OPT_INT,        &opt.acctg_freq,    NULL           },
+  {"SALLOC_BELL",          OPT_BELL,       NULL,               NULL           },
   {"SALLOC_CONN_TYPE",     OPT_CONN_TYPE,  NULL,               NULL           },
+  {"SALLOC_CPU_BIND",      OPT_CPU_BIND,   NULL,               NULL           },
   {"SALLOC_DEBUG",         OPT_DEBUG,      NULL,               NULL           },
+  {"SALLOC_EXCLUSIVE",     OPT_EXCLUSIVE,  NULL,               NULL           },
   {"SALLOC_GEOMETRY",      OPT_GEOMETRY,   NULL,               NULL           },
   {"SALLOC_IMMEDIATE",     OPT_BOOL,       &opt.immediate,     NULL           },
   {"SALLOC_JOBID",         OPT_JOBID,      NULL,               NULL           },
+  {"SALLOC_MEM_BIND",      OPT_MEM_BIND,   NULL,               NULL           },
+  {"SALLOC_NETWORK",       OPT_STRING    , &opt.network,       NULL           },
+  {"SALLOC_NO_BELL",       OPT_NO_BELL,    NULL,               NULL           },
   {"SALLOC_NO_ROTATE",     OPT_NO_ROTATE,  NULL,               NULL           },
+  {"SALLOC_OVERCOMMIT",    OPT_OVERCOMMIT, NULL,               NULL           },
   {"SALLOC_PARTITION",     OPT_STRING,     &opt.partition,     NULL           },
   {"SALLOC_TIMELIMIT",     OPT_STRING,     &opt.time_limit_str,NULL           },
   {"SALLOC_WAIT",          OPT_INT,        &opt.max_wait,      NULL           },
-  {"SALLOC_BELL",          OPT_BELL,       NULL,               NULL           },
-  {"SALLOC_NO_BELL",       OPT_NO_BELL,    NULL,               NULL           },
-  {"SALLOC_EXCLUSIVE",     OPT_EXCLUSIVE,  NULL,               NULL           },
-  {"SALLOC_OVERCOMMIT",    OPT_OVERCOMMIT, NULL,               NULL           },
-  {"SALLOC_ACCTG_FREQ",    OPT_INT,        &opt.acctg_freq,    NULL           },
-  {"SALLOC_NETWORK",       OPT_STRING    , &opt.network,       NULL           },
   {NULL, 0, NULL, NULL}
 };
 
@@ -439,6 +449,16 @@ _process_env_var(env_vars_t *e, const char *val)
 	case OPT_OVERCOMMIT:
 		opt.overcommit = true;
 		break;
+	case OPT_CPU_BIND:
+		if (slurm_verify_cpu_bind(val, &opt.cpu_bind,
+					  &opt.cpu_bind_type))
+			exit(1);
+		break;
+	case OPT_MEM_BIND:
+		if (slurm_verify_mem_bind(val, &opt.mem_bind,
+					  &opt.mem_bind_type))
+			exit(1);
+		break;
 	default:
 		/* do nothing */
 		break;
@@ -542,6 +562,8 @@ void set_options(const int argc, char **argv)
 		{"no-shell",      no_argument,       0, LONG_OPT_NOSHELL},
 		{"get-user-env",  optional_argument, 0, LONG_OPT_GET_USER_ENV},
 		{"network",       required_argument, 0, LONG_OPT_NETWORK},
+		{"cpu_bind",      required_argument, 0, LONG_OPT_CPU_BIND},
+		{"mem_bind",      required_argument, 0, LONG_OPT_MEM_BIND},
 		{NULL,            0,                 0, 0}
 	};
 	char *opt_string = "+a:B:c:C:d:D:F:g:hHIJ:kK:L:m:n:N:Op:P:qR:st:uU:vVw:W:x:";
@@ -918,6 +940,16 @@ void set_options(const int argc, char **argv)
 			xfree(opt.network);
 			opt.network = xstrdup(optarg);
 			break;
+		case LONG_OPT_CPU_BIND:
+			if (slurm_verify_cpu_bind(optarg, &opt.cpu_bind,
+						  &opt.cpu_bind_type))
+				exit(1);
+			break;
+		case LONG_OPT_MEM_BIND:
+			if (slurm_verify_mem_bind(optarg, &opt.mem_bind,
+						  &opt.mem_bind_type))
+				exit(1);
+			break;
 		default:
 			fatal("Unrecognized command line parameter %c",
 			      opt_char);
@@ -1180,6 +1212,30 @@ static bool _opt_verify(void)
 		opt.network = "us,sn_all,bulk_xfer";
 #endif
 
+	if (slurm_verify_cpu_bind(NULL, &opt.cpu_bind,
+				  &opt.cpu_bind_type))
+		exit(1);
+	if (opt.cpu_bind_type && (getenv("SLURM_CPU_BIND") == NULL)) {
+		char tmp[64];
+		slurm_sprint_cpu_bind_type(tmp, opt.cpu_bind_type);
+		if (opt.cpu_bind) {
+			setenvf(NULL, "SLURM_CPU_BIND", "%s:%s", 
+				tmp, opt.cpu_bind);
+		} else {
+			setenvf(NULL, "SLURM_CPU_BIND", "%s", tmp);
+		}
+	}
+	if (opt.mem_bind_type && (getenv("SLURM_MEM_BIND") == NULL)) {
+		char tmp[64];
+		slurm_sprint_mem_bind_type(tmp, opt.mem_bind_type);
+		if (opt.mem_bind) {
+			setenvf(NULL, "SLURM_MEM_BIND", "%s:%s", 
+				tmp, opt.mem_bind);
+		} else {
+			setenvf(NULL, "SLURM_MEM_BIND", "%s", tmp);
+		}
+	}
+
 	return verified;
 }
 
@@ -1361,6 +1417,10 @@ static void _opt_list()
 	info("ntasks-per-socket : %d", opt.ntasks_per_socket);
 	info("ntasks-per-core   : %d", opt.ntasks_per_core);
 	info("plane_size        : %u", opt.plane_size);
+	info("cpu_bind          : %s", 
+	     opt.cpu_bind == NULL ? "default" : opt.cpu_bind);
+	info("mem_bind          : %s",
+	     opt.mem_bind == NULL ? "default" : opt.mem_bind);
 	str = print_commandline(command_argc, command_argv);
 	info("user command   : `%s'", str);
 	xfree(str);
@@ -1387,6 +1447,7 @@ static void _usage(void)
 "              [--bell] [--no-bell] [--kill-command[=signal]]\n"
 "              [--nodefile=file] [--nodelist=hosts] [--exclude=hosts]\n"
 "              [--network=type] [--mem-per-cpu=MB]\n"
+"              [--cpu_bind=...] [--mem_bind=...]\n"
 "              [executable [args...]]\n");
 }
 
@@ -1468,7 +1529,11 @@ static void _help(void)
 	    && strcasecmp(conf->task_plugin, "task/affinity") == 0) {
 		printf(
 "      --hint=                 Bind tasks according to application hints\n"
-"                              (see \"--hint=help\" for options)\n");
+"                              (see \"--hint=help\" for options)\n"
+"      --cpu_bind=             Bind tasks to CPUs\n"
+"                              (see \"--cpu_bind=help\" for options)\n"
+"      --mem_bind=             Bind memory to locality domains (ldom)\n"
+"                              (see \"--mem_bind=help\" for options)\n");
 	}
 	slurm_conf_unlock();
 
diff --git a/src/salloc/opt.h b/src/salloc/opt.h
index 2ca869cc76a..4021fff4868 100644
--- a/src/salloc/opt.h
+++ b/src/salloc/opt.h
@@ -72,6 +72,9 @@ typedef struct salloc_options {
 	int ntasks_per_socket; /* --ntasks-per-socket=n     */
 	int ntasks_per_core;   /* --ntasks-per-core=n	    */
 	cpu_bind_type_t cpu_bind_type; /* --cpu_bind=           */
+	char *cpu_bind;		/* binding map for map/mask_cpu */
+	mem_bind_type_t mem_bind_type; /* --mem_bind=		*/
+	char *mem_bind;		/* binding map for map/mask_mem	*/
 	bool extra_set;		/* true if extra node info explicitly set */
 	int  time_limit;	/* --time,   -t	(int minutes)	*/
 	char *time_limit_str;	/* --time,   -t (string)	*/
diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c
index 1f1f8e7dd79..0142a8fd62f 100644
--- a/src/salloc/salloc.c
+++ b/src/salloc/salloc.c
@@ -342,9 +342,21 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc)
 	desc->group_id = opt.gid;
 	if (opt.dependency)
 		desc->dependency = xstrdup(opt.dependency);
+
+	if (opt.cpu_bind)
+		desc->cpu_bind       = opt.cpu_bind;
+	if (opt.cpu_bind_type)
+		desc->cpu_bind_type  = opt.cpu_bind_type;
+	if (opt.mem_bind)
+		desc->mem_bind       = opt.mem_bind;
+	if (opt.mem_bind_type)
+		desc->mem_bind_type  = opt.mem_bind_type;
+	if (opt.plane_size != NO_VAL)
+		desc->plane_size     = opt.plane_size;
 	desc->task_dist  = opt.distribution;
 	if (opt.plane_size != NO_VAL)
 		desc->plane_size = opt.plane_size;
+
 	if (opt.licenses)
 		desc->licenses = xstrdup(opt.licenses);
 	desc->network = opt.network;
diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c
index 673e7f5d78f..33398f00c60 100644
--- a/src/sbatch/opt.c
+++ b/src/sbatch/opt.c
@@ -1935,8 +1935,8 @@ static bool _opt_verify(void)
 	setenv("SLURM_NETWORK", opt.network, 1);
 #endif
 
-	 if (slurm_verify_cpu_bind(NULL, &opt.cpu_bind,
-				   &opt.cpu_bind_type))
+	if (slurm_verify_cpu_bind(NULL, &opt.cpu_bind,
+				  &opt.cpu_bind_type))
 		exit(1);
 	if (opt.cpu_bind_type && (getenv("SLURM_CPU_BIND") == NULL)) {
 		char tmp[64];
-- 
GitLab