From ecd5bd2b957d5e1ced91601dce5d829287a19b9d Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 3 Nov 2008 17:11:45 +0000 Subject: [PATCH] Add support of --cpu_bind and --mem_bind to salloc --- NEWS | 2 +- doc/man/man1/salloc.1 | 125 +++++++++++++++++++++++++++++++++++++++++- doc/man/man1/sbatch.1 | 2 +- src/salloc/opt.c | 105 ++++++++++++++++++++++++++++------- src/salloc/opt.h | 3 + src/salloc/salloc.c | 12 ++++ src/sbatch/opt.c | 4 +- 7 files changed, 227 insertions(+), 26 deletions(-) diff --git a/NEWS b/NEWS index a8e3cef1412..5ec01a20e5f 100644 --- a/NEWS +++ b/NEWS @@ -18,7 +18,7 @@ documents those changes that are of interest to users and admins. within the job step credential. -- Add cpu_bind, cpu_bind_type, mem_bind and mem_bind_type to job allocation request and job_details structure in slurmctld. Add support to --cpu_bind - and --mem_bind options from sbatch command. + and --mem_bind options from salloc and sbatch commands. * Changes in SLURM 1.4.0-pre3 ============================= diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index c4451cfaa7a..90af181c579 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -1,4 +1,4 @@ -.TH "salloc" "1" "SLURM 1.3" "August 2008" "SLURM Commands" +.TH "salloc" "1" "SLURM 1.4" "November 2008" "SLURM Commands" .SH "NAME" .LP salloc \- Obtain a SLURM job allocation (a set of nodes), execute a command, @@ -128,6 +128,63 @@ An arbitrary comment. Demand a contiguous range of nodes. The default is "yes". Specify \-\-contiguous=no if a contiguous range of nodes is not required. +.TP +\fB\-\-cpu_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR +Bind tasks to CPUs. Used only when the task/affinity plugin is enabled. +The configuration parameter \fBTaskPluginParam\fR may override these options. + +The \fBSLURM_CPU_BIND\fR environment variables are set when \fB\-\-cpu_bind\fR \ +is in use. 
+ +When using \fB\-\-cpus\-per\-task\fR to run multithreaded tasks, be aware that +CPU binding is inherited from the parent of the process. This means that +the multithreaded task should either specify or clear the CPU binding +itself to avoid having all threads of the multithreaded task use the same +mask/CPU as the parent. Alternatively, fat masks (masks which specify more +than one allowed CPU) could be used for the tasks in order to provide +multiple CPUs for the multithreaded tasks. + +Supported options include: +.PD 1 +.RS +.TP +.B q[uiet] +quietly bind before task runs (default) +.TP +.B v[erbose] +verbosely report binding before task runs +.TP +.B no[ne] +don't bind tasks to CPUs (default) +.TP +.B rank +bind by task rank +.TP +.B map_cpu:<list> +bind by mapping CPU IDs to tasks as specified +where <list> is <cpuid1>,<cpuid2>,...<cpuidN>. +CPU IDs are interpreted as decimal values unless they are preceded +with '0x' in which case they are interpreted as hexadecimal values. +.TP +.B mask_cpu:<list> +bind by setting CPU masks on tasks as specified +where <list> is <mask1>,<mask2>,...<maskN>. +CPU masks are \fBalways\fR interpreted as hexadecimal values but can be +preceded with an optional '0x'. +.TP +.B sockets +auto\-generated masks bind to sockets +.TP +.B cores +auto\-generated masks bind to cores +.TP +.B threads +auto\-generated masks bind to threads +.TP +.B help +show this help message +.RE + .TP \fB\-c\fR, \fB\-\-cpus\-per\-task\fR[=]<\fIncpus\fR> Advise the SLURM controller that ensuing job steps will require \fIncpus\fR @@ -330,6 +387,59 @@ are allocated to jobs (\fBSelectType=select/linear\fR). Also see \fB\-\-mem\-per\-cpu\fR. \fB\-\-mem\fR and \fB\-\-mem\-per\-cpu\fR are mutually exclusive. +.TP +\fB\-\-mem_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR +Bind tasks to memory. Used only when the task/affinity plugin is enabled +and the NUMA memory functions are available. 
+\fBNote that the resolution of CPU and memory binding +may differ on some architectures.\fR For example, CPU binding may be performed +at the level of the cores within a processor while memory binding will +be performed at the level of nodes, where the definition of "nodes" +may differ from system to system. \fBThe use of any type other than +"none" or "local" is not recommended.\fR +If you want greater control, try running a simple test code with the +options "\-\-cpu_bind=verbose,none \-\-mem_bind=verbose,none" to determine +the specific configuration. + +The \fBSLURM_MEM_BIND\fR environment variable is set when \fB\-\-mem_bind\fR +is in use. + +Supported options include: +.RS +.TP +.B q[uiet] +quietly bind before task runs (default) +.TP +.B v[erbose] +verbosely report binding before task runs +.TP +.B no[ne] +don't bind tasks to memory (default) +.TP +.B rank +bind by task rank (not recommended) +.TP +.B local +Use memory local to the processor in use +.TP +.B map_mem:<list> +bind by mapping a node's memory to tasks as specified +where <list> is <cpuid1>,<cpuid2>,...<cpuidN>. +CPU IDs are interpreted as decimal values unless they are preceded +with '0x' in which case they are interpreted as hexadecimal values +(not recommended) +.TP +.B mask_mem:<list> +bind by setting memory masks on tasks as specified +where <list> is <mask1>,<mask2>,...<maskN>. +Memory masks are \fBalways\fR interpreted as hexadecimal values. +Note that masks must be preceded with a '0x' if they don't begin +with [0-9] so they are seen as numerical values by srun. +.TP +.B help +show this help message +.RE + .TP \fB\-\-mem\-per\-cpu\fR[=]<\fIMB\fR> Mimimum memory required per allocated CPU in MegaBytes. @@ -632,6 +742,9 @@ Same as \fB\-\-bell\fR. \fBSALLOC_CONN_TYPE\fR Same as \fB\-\-conn\-type\fR. .TP +\fBSALLOC_CPU_BIND\fR +Same as \fB\-\-cpu_bind\fR. +.TP \fBSALLOC_DEBUG\fR Same as \fB\-v\fR or \fB\-\-verbose\fR. .TP @@ -647,6 +760,9 @@ Same as \fB\-I\fR or \fB\-\-immediate\fR. 
\fBSALLOC_JOBID\fR Same as \fB\-\-jobid\fR. .TP +\fBSALLOC_MEM_BIND\fR +Same as \fB\-\-mem_bind\fR. +.TP \fBSALLOC_NETWORK\fR Same as \fB\-\-network\fR. .TP @@ -672,7 +788,9 @@ Same as \fB\-W\fR or \fB\-\-wait\fR. .PP salloc will set the following environment variables in the environment of the executed program: - +.TP +\fBSLURM_CPU_BIND\fR +Set to the value of the \fB\-\-cpu_bind\fR option. .TP \fBSLURM_JOB_ID\fR (and \fBSLURM_JOBID\fR for backwards compatibility) The ID of the job allocation. @@ -691,6 +809,9 @@ List of nodes allocated to the job. \fBSLURM_JOB_NUM_NODES\fR (and \fBSLURM_NNODES\fR for backwards compatibility) Total number of nodes in the job allocation. .TP +\fBSLURM_MEM_BIND\fR +Set to the value of the \fB\-\-mem_bind\fR option. +.TP \fBSLURM_TASKS_PER_NODE\fR Number of tasks to be initiated on each node. Values are comma separated and in the same order as SLURM_NODELIST. diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 6381ffafd8c..2b27b9d1687 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -1,4 +1,4 @@ -.TH "sbatch" "1" "SLURM 1.3" "July 2008" "SLURM Commands" +.TH "sbatch" "1" "SLURM 1.4" "November 2008" "SLURM Commands" .SH "NAME" .LP sbatch \- Submit a batch script to SLURM. 
diff --git a/src/salloc/opt.c b/src/salloc/opt.c index 85bcd1e4942..f8d698db5f1 100644 --- a/src/salloc/opt.c +++ b/src/salloc/opt.c @@ -64,6 +64,7 @@ #include "src/common/proc_args.h" #include "src/common/read_config.h" /* contains getnodename() */ #include "src/common/slurm_protocol_api.h" +#include "src/common/slurm_resource_info.h" #include "src/common/slurm_rlimits_info.h" #include "src/common/uid.h" #include "src/common/xmalloc.h" @@ -77,20 +78,24 @@ #define OPT_INT 0x01 #define OPT_STRING 0x02 #define OPT_DEBUG 0x03 -#define OPT_NODES 0x05 -#define OPT_BOOL 0x06 -#define OPT_CORE 0x07 -#define OPT_CONN_TYPE 0x08 -#define OPT_NO_ROTATE 0x0a -#define OPT_GEOMETRY 0x0b -#define OPT_BELL 0x0f -#define OPT_NO_BELL 0x10 -#define OPT_JOBID 0x11 -#define OPT_EXCLUSIVE 0x12 -#define OPT_OVERCOMMIT 0x13 -#define OPT_ACCTG_FREQ 0x14 +#define OPT_NODES 0x04 +#define OPT_BOOL 0x05 +#define OPT_CORE 0x06 +#define OPT_CONN_TYPE 0x07 +#define OPT_NO_ROTATE 0x08 +#define OPT_GEOMETRY 0x09 +#define OPT_BELL 0x0a +#define OPT_NO_BELL 0x0b +#define OPT_JOBID 0x0c +#define OPT_EXCLUSIVE 0x0d +#define OPT_OVERCOMMIT 0x0e +#define OPT_ACCTG_FREQ 0x0f +#define OPT_CPU_BIND 0x10 +#define OPT_MEM_BIND 0x11 /* generic getopt_long flags, integers and *not* valid characters */ +#define LONG_OPT_CPU_BIND 0x101 +#define LONG_OPT_MEM_BIND 0x102 #define LONG_OPT_JOBID 0x105 #define LONG_OPT_TMP 0x106 #define LONG_OPT_MEM 0x107 @@ -236,7 +241,10 @@ static void _opt_default() opt.ntasks_per_node = NO_VAL; /* ntask max limits */ opt.ntasks_per_socket = NO_VAL; opt.ntasks_per_core = NO_VAL; - opt.cpu_bind_type = 0; /* local dummy variable for now */ + opt.cpu_bind_type = 0; + opt.cpu_bind = NULL; + opt.mem_bind_type = 0; + opt.mem_bind = NULL; opt.time_limit = NO_VAL; opt.time_limit_str = NULL; opt.partition = NULL; @@ -313,21 +321,23 @@ struct env_vars { env_vars_t env_vars[] = { {"SALLOC_ACCOUNT", OPT_STRING, &opt.account, NULL }, + {"SALLOC_ACCTG_FREQ", OPT_INT, &opt.acctg_freq, NULL }, + 
{"SALLOC_BELL", OPT_BELL, NULL, NULL }, {"SALLOC_CONN_TYPE", OPT_CONN_TYPE, NULL, NULL }, + {"SALLOC_CPU_BIND", OPT_CPU_BIND, NULL, NULL }, {"SALLOC_DEBUG", OPT_DEBUG, NULL, NULL }, + {"SALLOC_EXCLUSIVE", OPT_EXCLUSIVE, NULL, NULL }, {"SALLOC_GEOMETRY", OPT_GEOMETRY, NULL, NULL }, {"SALLOC_IMMEDIATE", OPT_BOOL, &opt.immediate, NULL }, {"SALLOC_JOBID", OPT_JOBID, NULL, NULL }, + {"SALLOC_MEM_BIND", OPT_MEM_BIND, NULL, NULL }, + {"SALLOC_NETWORK", OPT_STRING , &opt.network, NULL }, + {"SALLOC_NO_BELL", OPT_NO_BELL, NULL, NULL }, {"SALLOC_NO_ROTATE", OPT_NO_ROTATE, NULL, NULL }, + {"SALLOC_OVERCOMMIT", OPT_OVERCOMMIT, NULL, NULL }, {"SALLOC_PARTITION", OPT_STRING, &opt.partition, NULL }, {"SALLOC_TIMELIMIT", OPT_STRING, &opt.time_limit_str,NULL }, {"SALLOC_WAIT", OPT_INT, &opt.max_wait, NULL }, - {"SALLOC_BELL", OPT_BELL, NULL, NULL }, - {"SALLOC_NO_BELL", OPT_NO_BELL, NULL, NULL }, - {"SALLOC_EXCLUSIVE", OPT_EXCLUSIVE, NULL, NULL }, - {"SALLOC_OVERCOMMIT", OPT_OVERCOMMIT, NULL, NULL }, - {"SALLOC_ACCTG_FREQ", OPT_INT, &opt.acctg_freq, NULL }, - {"SALLOC_NETWORK", OPT_STRING , &opt.network, NULL }, {NULL, 0, NULL, NULL} }; @@ -439,6 +449,16 @@ _process_env_var(env_vars_t *e, const char *val) case OPT_OVERCOMMIT: opt.overcommit = true; break; + case OPT_CPU_BIND: + if (slurm_verify_cpu_bind(val, &opt.cpu_bind, + &opt.cpu_bind_type)) + exit(1); + break; + case OPT_MEM_BIND: + if (slurm_verify_mem_bind(val, &opt.mem_bind, + &opt.mem_bind_type)) + exit(1); + break; default: /* do nothing */ break; @@ -542,6 +562,8 @@ void set_options(const int argc, char **argv) {"no-shell", no_argument, 0, LONG_OPT_NOSHELL}, {"get-user-env", optional_argument, 0, LONG_OPT_GET_USER_ENV}, {"network", required_argument, 0, LONG_OPT_NETWORK}, + {"cpu_bind", required_argument, 0, LONG_OPT_CPU_BIND}, + {"mem_bind", required_argument, 0, LONG_OPT_MEM_BIND}, {NULL, 0, 0, 0} }; char *opt_string = "+a:B:c:C:d:D:F:g:hHIJ:kK:L:m:n:N:Op:P:qR:st:uU:vVw:W:x:"; @@ -918,6 +940,16 @@ void 
set_options(const int argc, char **argv) xfree(opt.network); opt.network = xstrdup(optarg); break; + case LONG_OPT_CPU_BIND: + if (slurm_verify_cpu_bind(optarg, &opt.cpu_bind, + &opt.cpu_bind_type)) + exit(1); + break; + case LONG_OPT_MEM_BIND: + if (slurm_verify_mem_bind(optarg, &opt.mem_bind, + &opt.mem_bind_type)) + exit(1); + break; default: fatal("Unrecognized command line parameter %c", opt_char); @@ -1180,6 +1212,30 @@ static bool _opt_verify(void) opt.network = "us,sn_all,bulk_xfer"; #endif + if (slurm_verify_cpu_bind(NULL, &opt.cpu_bind, + &opt.cpu_bind_type)) + exit(1); + if (opt.cpu_bind_type && (getenv("SLURM_CPU_BIND") == NULL)) { + char tmp[64]; + slurm_sprint_cpu_bind_type(tmp, opt.cpu_bind_type); + if (opt.cpu_bind) { + setenvf(NULL, "SLURM_CPU_BIND", "%s:%s", + tmp, opt.cpu_bind); + } else { + setenvf(NULL, "SLURM_CPU_BIND", "%s", tmp); + } + } + if (opt.mem_bind_type && (getenv("SLURM_MEM_BIND") == NULL)) { + char tmp[64]; + slurm_sprint_mem_bind_type(tmp, opt.mem_bind_type); + if (opt.mem_bind) { + setenvf(NULL, "SLURM_MEM_BIND", "%s:%s", + tmp, opt.mem_bind); + } else { + setenvf(NULL, "SLURM_MEM_BIND", "%s", tmp); + } + } + return verified; } @@ -1361,6 +1417,10 @@ static void _opt_list() info("ntasks-per-socket : %d", opt.ntasks_per_socket); info("ntasks-per-core : %d", opt.ntasks_per_core); info("plane_size : %u", opt.plane_size); + info("cpu_bind : %s", + opt.cpu_bind == NULL ? "default" : opt.cpu_bind); + info("mem_bind : %s", + opt.mem_bind == NULL ? "default" : opt.mem_bind); str = print_commandline(command_argc, command_argv); info("user command : `%s'", str); xfree(str); @@ -1387,6 +1447,7 @@ static void _usage(void) " [--bell] [--no-bell] [--kill-command[=signal]]\n" " [--nodefile=file] [--nodelist=hosts] [--exclude=hosts]\n" " [--network=type] [--mem-per-cpu=MB]\n" +" [--cpu_bind=...] 
[--mem_bind=...]\n" " [executable [args...]]\n"); } @@ -1468,7 +1529,11 @@ static void _help(void) && strcasecmp(conf->task_plugin, "task/affinity") == 0) { printf( " --hint= Bind tasks according to application hints\n" -" (see \"--hint=help\" for options)\n"); +" (see \"--hint=help\" for options)\n" +" --cpu_bind= Bind tasks to CPUs\n" +" (see \"--cpu_bind=help\" for options)\n" +" --mem_bind= Bind memory to locality domains (ldom)\n" +" (see \"--mem_bind=help\" for options)\n"); } slurm_conf_unlock(); diff --git a/src/salloc/opt.h b/src/salloc/opt.h index 2ca869cc76a..4021fff4868 100644 --- a/src/salloc/opt.h +++ b/src/salloc/opt.h @@ -72,6 +72,9 @@ typedef struct salloc_options { int ntasks_per_socket; /* --ntasks-per-socket=n */ int ntasks_per_core; /* --ntasks-per-core=n */ cpu_bind_type_t cpu_bind_type; /* --cpu_bind= */ + char *cpu_bind; /* binding map for map/mask_cpu */ + mem_bind_type_t mem_bind_type; /* --mem_bind= */ + char *mem_bind; /* binding map for map/mask_mem */ bool extra_set; /* true if extra node info explicitly set */ int time_limit; /* --time, -t (int minutes) */ char *time_limit_str; /* --time, -t (string) */ diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 1f1f8e7dd79..0142a8fd62f 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -342,9 +342,21 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc) desc->group_id = opt.gid; if (opt.dependency) desc->dependency = xstrdup(opt.dependency); + + if (opt.cpu_bind) + desc->cpu_bind = opt.cpu_bind; + if (opt.cpu_bind_type) + desc->cpu_bind_type = opt.cpu_bind_type; + if (opt.mem_bind) + desc->mem_bind = opt.mem_bind; + if (opt.mem_bind_type) + desc->mem_bind_type = opt.mem_bind_type; + if (opt.plane_size != NO_VAL) + desc->plane_size = opt.plane_size; desc->task_dist = opt.distribution; if (opt.plane_size != NO_VAL) desc->plane_size = opt.plane_size; + if (opt.licenses) desc->licenses = xstrdup(opt.licenses); desc->network = opt.network; diff --git a/src/sbatch/opt.c 
b/src/sbatch/opt.c index 673e7f5d78f..33398f00c60 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -1935,8 +1935,8 @@ static bool _opt_verify(void) setenv("SLURM_NETWORK", opt.network, 1); #endif - if (slurm_verify_cpu_bind(NULL, &opt.cpu_bind, - &opt.cpu_bind_type)) + if (slurm_verify_cpu_bind(NULL, &opt.cpu_bind, + &opt.cpu_bind_type)) exit(1); if (opt.cpu_bind_type && (getenv("SLURM_CPU_BIND") == NULL)) { char tmp[64]; -- GitLab