From 7844a935dabbcb996d78ef745752d5b43b0e7626 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 9 Dec 2005 17:26:01 +0000 Subject: [PATCH] Added support for task affinity for binding tasks to CPUs (Daniel Palermo, HP). --- NEWS | 5 + configure.ac | 4 + doc/man/man1/srun.1 | 64 ++++++ doc/man/man5/slurm.conf.5 | 3 +- etc/slurm.conf.example | 11 +- slurm.spec | 2 + slurm/slurm.h.in | 8 + src/common/env.c | 58 ++++++ src/common/env.h | 3 + src/common/slurm_protocol_defs.c | 1 + src/common/slurm_protocol_defs.h | 2 + src/common/slurm_protocol_pack.c | 4 + src/plugins/task/Makefile.am | 2 +- src/plugins/task/affinity/Makefile.am | 19 ++ src/plugins/task/affinity/affinity.c | 239 ++++++++++++++++++++++ src/plugins/task/affinity/affinity.h | 83 ++++++++ src/plugins/task/affinity/schedutils.c | 132 ++++++++++++ src/plugins/task/affinity/task_affinity.c | 140 +++++++++++++ src/slurmd/slurmstepd/mgr.c | 2 +- src/slurmd/slurmstepd/slurmstepd_job.c | 8 + src/slurmd/slurmstepd/slurmstepd_job.h | 2 + src/slurmd/slurmstepd/task.c | 7 + src/srun/launch.c | 2 + src/srun/opt.c | 138 ++++++++++++- src/srun/opt.h | 2 + 25 files changed, 929 insertions(+), 12 deletions(-) create mode 100644 src/plugins/task/affinity/Makefile.am create mode 100644 src/plugins/task/affinity/affinity.c create mode 100644 src/plugins/task/affinity/affinity.h create mode 100644 src/plugins/task/affinity/schedutils.c create mode 100644 src/plugins/task/affinity/task_affinity.c diff --git a/NEWS b/NEWS index d46a5578ba0..6cbe21a3a4f 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,11 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 0.7.0-pre6 +============================= + -- Added support for task affinity for binding tasks to CPUs (Daniel + Palermo, HP). + * Changes in SLURM 0.7.0-pre5 ============================= -- Enhanced performance and debugging for slurmctld reconfiguration. 
diff --git a/configure.ac b/configure.ac index 0dfbe96b172..7af066370c5 100644 --- a/configure.ac +++ b/configure.ac @@ -92,6 +92,9 @@ AC_CHECK_DECLS([hstrerror, strsignal, sys_siglist]) AC_CHECK_FUNCS(unsetenv, [have_unsetenv=yes]) AM_CONDITIONAL(HAVE_UNSETENV, test "x$have_unsetenv" = "xyes") +AC_CHECK_FUNCS(sched_setaffinity, [have_sched_setaffinity=yes]) +AM_CONDITIONAL(HAVE_SCHED_SETAFFINITY, test "x$have_sched_setaffinity" = "xyes") + ACX_PTHREAD([], AC_MSG_ERROR([Error: Cannot figure out how to use pthreads!])) # Always define WITH_PTHREADS if we make it this far @@ -239,6 +242,7 @@ AC_CONFIG_FILES([Makefile src/plugins/mpi/lam/Makefile src/plugins/mpi/none/Makefile src/plugins/task/Makefile + src/plugins/task/affinity/Makefile src/plugins/task/none/Makefile doc/Makefile doc/man/Makefile diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index db57087f75d..29707bb2613 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -475,6 +475,58 @@ Request that a specific list of hosts not be included in the resources allocated to this job. The host list will be assumed to be a filename if it contains a "/"character. +.PP +Affinity/Multi-core Options: (when the task/affinity plugin is enabled) +.TP +\fB\-\-cpu_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR +Bind tasks to CPUs +.RS +.TP +.B q[uiet], +quietly bind before task runs (default) +.TP +.B v[erbose], +verbosely report binding before task runs +.TP +.B no[ne] +don't bind tasks to CPUs (default) +.TP +.B rank +bind by task rank +.TP +.B map_cpu:<list> +bind by mapping CPU IDs to tasks as specified +where <list> is <cpuid1>,<cpuid2>,...<cpuidN>. +CPU IDs are interpreted as decimal values unless they are preceded +with '0x' in which case they are interpreted as hexadecimal values. +.TP +.B mask_cpu:<list> +bind by setting CPU masks on tasks as specified +where <list> is <mask1>,<mask2>,...<maskN>. +CPU masks are \fBalways\fR interpreted as hexadecimal values but can be +preceded with an optional '0x'. 
+.RE + +To have SLURM always report on the selected binding for all srun commands +executed in a shell, you can also enable verbose mode separately from +the command line with: +.PP +.nf + setenv SLURM_CPU_BIND verbose +.fi +.PP +SLURM_CPU_BIND will not propagate into the task's environment (binding +by default only affects the first srun). To propagate --cpu_bind to +successive srun commands, first do the following in each task: +.PP +.nf + setenv SLURM_CPU_BIND \\ + ${SLURM_CPU_BIND_VERBOSE},${SLURM_CPU_BIND_TYPE}${SLURM_CPU_BIND_LIST} +.fi +.PP +See the \fBENVIRONMENT VARIABLES\fR section for a more detailed description +of the individual SLURM_CPU_BIND* variables. + .PP The following options support AIX systems, but may be applicable to other systems as well. Since POE is used to launch tasks, these @@ -704,6 +756,9 @@ The location of the SLURM configuration file. .TP \fBSLURM_ACCOUNT\fR \fB\-U, \-\-account\fR=\fIaccount\fR +.TP +\fBSLURM_CPU_BIND\fR +\fB\-\-cpu_bind\fR=\fItype\fR .TP 20 \fBSLURM_CPUS_PER_TASK\fR \fB\-c, \-\-ncpus\-per\-task\fR=\fIn\fR @@ -776,6 +831,15 @@ Additionally, will set some environment variables in the environment of the executing tasks on the remote compute nodes. These environment variables are: +.TP +\fBSLURM_CPU_BIND_VERBOSE\fR +--cpu_bind verbosity (quiet,verbose). +.TP +\fBSLURM_CPU_BIND_TYPE\fR +--cpu_bind type (none,rank,map_cpu:,mask_cpu:) +.TP +\fBSLURM_CPU_BIND_LIST\fR +--cpu_bind map or mask list (<list of IDs or masks for this node>) .TP 20 \fBSLURM_CPUS_ON_NODE\fR Count of processors available to the job on this node diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 98dd95b7e4e..6c4e7f52c67 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -461,7 +461,8 @@ Identifies the type of task launch plugin, typically used to provide resource management within a node (e.g. pinning tasks to specific processors). 
Acceptable values include -"task/none" for systems requiring no special handling. +"task/none" for systems requiring no special handling +or "task/affinity" to enable the --cpu_bind CPU affinity srun option. The default value is "task/none". The order of task prolog/epilog execution is as follows: .RS diff --git a/etc/slurm.conf.example b/etc/slurm.conf.example index 1db9cc67ffd..23c1ef1f3b5 100644 --- a/etc/slurm.conf.example +++ b/etc/slurm.conf.example @@ -450,20 +450,21 @@ JobAcctType=jobacct/none # # o Define task launch specific parameters # -# "TaskProlog" : Define a program to be executed as root before each +# "TaskProlog" : Define a program to be executed as the user before each # task begins execution. -# "TaskEpilog" : Define a program to be executed as root after each +# "TaskEpilog" : Define a program to be executed as the user after each # task terminates. # "TaskPlugin" : Define a task launch plugin. This may be used to # provide resource management within a node (e.g. pinning -# tasks to specific processors). Permissible values are -# "task/none" : no task launch actions, the default. +# tasks to specific processors). Permissible values are: +# "task/none" : no task launch actions, the default. 
+# "task/affinity" : CPU affinity support (see "srun --cpu_bind=") # # Example: # # TaskProlog=/usr/local/slurm/etc/task_prolog # default is none # TaskEpilog=/usr/local/slurm/etc/task_epilog # default is none -# TaskPlugin=task/none # default is task/none +# TaskPlugin=task/affinity # default is task/none # diff --git a/slurm.spec b/slurm.spec index ab76e0725df..24faaeb13a6 100644 --- a/slurm.spec +++ b/slurm.spec @@ -159,6 +159,8 @@ touch $LIST if [ -d /etc/init.d ]; then echo "%config(noreplace) /etc/init.d/slurm" >> $LIST fi +test -f $RPM_BUILD_ROOT/%{_libdir}/slurm/task_affinity.so && + echo %{_libdir}/slurm/task_affinity.so >> $LIST # Build file lists for optional plugin packages for plugin in auth_munge auth_authd sched_wiki; do diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 42df944b0ac..2fae1c36f24 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -211,6 +211,14 @@ enum task_dist_states { SLURM_DIST_UNKNOWN /* unknown dist */ }; +typedef enum cpu_bind_type { /* cpu binding type from --cpu_bind=... */ + CPU_BIND_VERBOSE= 0x01, /* =v, */ + CPU_BIND_NONE = 0x02, /* =no */ + CPU_BIND_RANK = 0x04, /* =rank */ + CPU_BIND_MAPCPU = 0x08, /* =map_cpu:<list of CPU IDs> */ + CPU_BIND_MASKCPU= 0x10, /* =mask_cpu:<list of CPU masks> */ +} cpu_bind_type_t; + /* The last entry in node_states must be STATE_END, keep in sync with * node_state_string. values may be ORed with NODE_STATE_FLAGS below. 
* Node states typically alternate between NODE_STATE_IDLE and diff --git a/src/common/env.c b/src/common/env.c index ea18139fa19..44145291e6d 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -183,6 +183,9 @@ void unsetenvp(char **env, const char *name) { char **ep; + if (env == NULL) + return; + ep = env; while ((ep = _find_name_in_env (ep, name)) && (*ep != NULL)) { char **dp = ep; @@ -260,6 +263,61 @@ int setup_env(env_t *env) } } + if (env->cpu_bind_type) { + unsetenvp(env->env, "SLURM_CPU_BIND"); /* don't propagate SLURM_CPU_BIND */ + int setstat = 0; + if (env->cpu_bind_type & CPU_BIND_VERBOSE) { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_VERBOSE", "verbose"); + } else { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_VERBOSE", "quiet"); + } + if (setstat) { + error("Unable to set SLURM_CPU_BIND_VERBOSE"); + rc = SLURM_FAILURE; + } + + setstat = 0; + if (env->cpu_bind_type & CPU_BIND_NONE) { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "none"); + } else if (env->cpu_bind_type & CPU_BIND_RANK) { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "rank"); + } else if (env->cpu_bind_type & CPU_BIND_MAPCPU) { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "map_cpu:"); + } else if (env->cpu_bind_type & CPU_BIND_MASKCPU) { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "mask_cpu:"); + } else if (env->cpu_bind_type & (~CPU_BIND_VERBOSE)) { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "unknown"); + } else { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", ""); + } + if (setstat) { + error("Unable to set SLURM_CPU_BIND_TYPE"); + rc = SLURM_FAILURE; + } + + setstat = 0; + if (env->cpu_bind) { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_LIST", env->cpu_bind); + } else { + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_LIST", ""); + } + if (setstat) { /* test the result gathered above; do not call setenvf() again with a possibly NULL cpu_bind */ + error("Unable to set SLURM_CPU_BIND_LIST"); + rc = SLURM_FAILURE; + } + } else { + unsetenvp(env->env, 
"SLURM_CPU_BIND"); /* don't propagate SLURM_CPU_BIND */ + /* set SLURM_CPU_BIND_* env vars to defaults */ + int setstat = 0; + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_VERBOSE", "quiet"); + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", ""); + setstat |= setenvf(&env->env, "SLURM_CPU_BIND_LIST", ""); + if (setstat) { + error("Unable to clear SLURM_CPU_BIND_*"); + rc = SLURM_FAILURE; + } + } + if (env->overcommit && (setenvf(&env->env, "SLURM_OVERCOMMIT", "1"))) { error("Unable to set SLURM_OVERCOMMIT environment variable"); diff --git a/src/common/env.h b/src/common/env.h index 109b125a877..0dace0a6a1b 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -46,6 +46,9 @@ typedef struct env_options { bool cpus_set; /* true if cpus_per_task explicitly set */ enum distribution_t distribution; /* --distribution=, -m dist */ + cpu_bind_type_t + cpu_bind_type; /* --cpu_bind= */ + char *cpu_bind; /* binding map for map/mask_cpu */ bool overcommit; /* --overcommit, -O */ int slurmd_debug; /* --slurmd-debug, -D */ bool labelio; /* --label-output, -l */ diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index c7de886dcbb..7f1103a74fc 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -323,6 +323,7 @@ void slurm_free_launch_tasks_request_msg(launch_tasks_request_msg_t * msg) xfree(msg->env); } xfree(msg->cwd); + xfree(msg->cpu_bind); if (msg->argv) { for (i = 0; i < msg->argc; i++) { xfree(msg->argv[i]); diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 99957aaef6b..516f4587dc7 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -275,6 +275,8 @@ typedef struct launch_tasks_request_msg { char **env; char **argv; char *cwd; + cpu_bind_type_t cpu_bind_type; /* --cpu_bind= */ + char *cpu_bind; /* binding map for map/mask_cpu */ uint16_t resp_port; uint16_t io_port; uint16_t task_flags; diff --git a/src/common/slurm_protocol_pack.c 
b/src/common/slurm_protocol_pack.c index 27557487289..ade9ee703c4 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2496,6 +2496,8 @@ _pack_launch_tasks_request_msg(launch_tasks_request_msg_t * msg, Buf buffer) pack32(msg->tasks_to_launch, buffer); packstr_array(msg->env, msg->envc, buffer); packstr(msg->cwd, buffer); + pack32(msg->cpu_bind_type, buffer); + packstr(msg->cpu_bind, buffer); packstr_array(msg->argv, msg->argc, buffer); pack16(msg->resp_port, buffer); pack16(msg->io_port, buffer); @@ -2537,6 +2539,8 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t ** safe_unpack32(&msg->tasks_to_launch, buffer); safe_unpackstr_array(&msg->env, &msg->envc, buffer); safe_unpackstr_xmalloc(&msg->cwd, &uint16_tmp, buffer); + safe_unpack32(&msg->cpu_bind_type, buffer); + safe_unpackstr_xmalloc(&msg->cpu_bind, &uint16_tmp, buffer); safe_unpackstr_array(&msg->argv, &msg->argc, buffer); safe_unpack16(&msg->resp_port, buffer); safe_unpack16(&msg->io_port, buffer); diff --git a/src/plugins/task/Makefile.am b/src/plugins/task/Makefile.am index 12ad6f77069..922e64385a3 100644 --- a/src/plugins/task/Makefile.am +++ b/src/plugins/task/Makefile.am @@ -1,4 +1,4 @@ # $Id: Makefile.am 4554 2005-03-08 14:25:17Z jking $ # Makefile for task plugins -SUBDIRS = none +SUBDIRS = affinity none diff --git a/src/plugins/task/affinity/Makefile.am b/src/plugins/task/affinity/Makefile.am new file mode 100644 index 00000000000..4a0e9dd393e --- /dev/null +++ b/src/plugins/task/affinity/Makefile.am @@ -0,0 +1,19 @@ +# $Id: Makefile.am,v 1.1 2005/11/04 02:17:53 palermo Exp $ +# Makefile for task/affinity plugin + +AUTOMAKE_OPTIONS = foreign + +PLUGIN_FLAGS = -module -avoid-version --export-dynamic + +INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common + +if HAVE_SCHED_SETAFFINITY +pkglib_LTLIBRARIES = task_affinity.la +task_affinity_la_SOURCES = task_affinity.c affinity.c schedutils.c affinity.h +task_affinity_la_LDFLAGS = $(SO_LDFLAGS) 
$(PLUGIN_FLAGS) + +else +pkglib_LTLIBRARIES = +EXTRA_task_affinity_la_SOURCES = task_affinity.c affinity.c schedutils.c affinity.h + +endif diff --git a/src/plugins/task/affinity/affinity.c b/src/plugins/task/affinity/affinity.c new file mode 100644 index 00000000000..3e967afaa0e --- /dev/null +++ b/src/plugins/task/affinity/affinity.c @@ -0,0 +1,239 @@ +/*****************************************************************************\ + * src/plugins/task/affinity/affinity.c - task affinity plugin + * $Id: affinity.c,v 1.2 2005/11/04 02:46:51 palermo Exp $ + ***************************************************************************** + * Copyright (C) 2005 Hewlett-Packard Development Company, L.P. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+\*****************************************************************************/ +#include "affinity.h" + +void slurm_chkaffinity(cpu_set_t *mask, slurmd_job_t *job, int statval) +{ + char bind_type[42]; + char status[42]; + char prefix[42]; + char suffix[42]; + char mstr[1 + CPU_SETSIZE / 4]; + int task_id = job->envtp->procid; + pid_t mypid = job->envtp->task_pid; + + if (!(job->cpu_bind_type & CPU_BIND_VERBOSE)) return; + + status[0] = '\0'; + prefix[0] = '\0'; + suffix[0] = '\0'; + if (statval) { strcpy(status, "FAILED "); } + + if (job->cpu_bind_type & CPU_BIND_NONE) { + strcpy(bind_type, "set to NO"); + strcpy(prefix, "current "); + sprintf(suffix, "is mask 0x"); + } else { + strcpy(prefix, "setting "); + sprintf(suffix, "to mask 0x"); + if (job->cpu_bind_type & CPU_BIND_RANK) { + strcpy(bind_type, "set to RANK"); + } else if (job->cpu_bind_type & CPU_BIND_MAPCPU) { + strcpy(bind_type, "set to MAP_CPU"); + } else if (job->cpu_bind_type & CPU_BIND_MASKCPU) { + strcpy(bind_type, "set to MASK_CPU"); + } else if (job->cpu_bind_type & (~CPU_BIND_VERBOSE)) { + strcpy(bind_type, "set to UNKNOWN"); + } else { + strcpy(bind_type, "not set"); + strcpy(prefix, "current "); + sprintf(suffix, "is mask 0x"); + } + } + + fprintf(stderr, "SLURM_CPU_BIND_TYPE %s, " + "%s%saffinity of task %u pid %u on host %s %s%s\n", + bind_type, + status, + prefix, + task_id, + mypid, + conf->hostname, + suffix, + cpuset_to_str(mask, mstr)); +} + +int get_cpuset(cpu_set_t *mask, slurmd_job_t *job) +{ + int nummasks, maskid, i; + char *curstr, *selstr; + char mstr[1 + CPU_SETSIZE / 4]; + int local_id = job->envtp->localid; + + debug3("get_cpuset (%d) %s\n", job->cpu_bind_type, job->cpu_bind); + CPU_ZERO(mask); + + if (job->cpu_bind_type & CPU_BIND_NONE) { + return true; + } + + if (job->cpu_bind_type & CPU_BIND_RANK) { + CPU_SET(job->envtp->localid % job->cpus, mask); + return true; + } + + if (!job->cpu_bind) + return false; + + nummasks = 1; + maskid = 0; + selstr = NULL; + + /* get number 
of strings present in cpu_bind */ + curstr = job->cpu_bind; + while (*curstr) { + if (nummasks == local_id+1) { + selstr = curstr; + maskid = local_id; + break; + } + if (*curstr == ',') + nummasks++; + curstr++; + } + + /* if we didn't already find the mask... */ + if (!selstr) { + /* ...select mask string by wrapping task ID into list */ + maskid = local_id % nummasks; + i = maskid; + curstr = job->cpu_bind; + while (*curstr && i) { + if (*curstr == ',') + i--; + curstr++; + } + if (!*curstr) { + return false; + } + selstr = curstr; + } + + /* extract the selected mask from the list */ + i = 0; + curstr = mstr; + while (*selstr && *selstr != ',' && i++ < (CPU_SETSIZE/4)) + *curstr++ = *selstr++; + *curstr = '\0'; + + if (job->cpu_bind_type & CPU_BIND_MASKCPU) { + /* convert mask string into cpu_set_t mask */ + if (str_to_cpuset(mask, mstr) < 0) { + error("str_to_cpuset %s", mstr); + return false; + } + return true; + } + + if (job->cpu_bind_type & CPU_BIND_MAPCPU) { + unsigned int mycpu = 0; + if (strncmp(mstr, "0x", 2) == 0) { + mycpu = strtoul (&(mstr[2]), NULL, 16); + } else { + mycpu = strtoul (mstr, NULL, 10); + } + CPU_SET(mycpu, mask); + return true; + } + + return false; +} + +/* user_older_affinity + * + * NOTE: some confusion in this. + * At first it seems: + * if glibc 2.3.2 then + * call sched_setaffinity(pid,mask) + * else + * call sched_setaffinity(pid,len,mask) + * but then some 2.4 kernels also have the + * 3 arg version - so its a mess. 
+ */ +#if defined __GLIBC__ +#include <gnu/libc-version.h> /* for gnu_get_libc_version */ +#endif +bool use_3arg_affinity() +{ + static bool has_3arg_affinity = true; + static bool already_checked = false; + if (already_checked) { + return has_3arg_affinity; + } +#if defined __GLIBC__ + const char *glibc_vers = gnu_get_libc_version(); + if (glibc_vers != NULL) { + int scnt = 0, major = 0, minor = 0, point = 0; + scnt = sscanf (glibc_vers, "%d.%d.%d", &major, + &minor, &point); + if (scnt == 3) { + if ((major <= 2) && (minor <= 3) && (point <= 2)) { + has_3arg_affinity = false; + } + } + debug3("glibc version: %d.%d.%d (%d)\n", + major, minor, point, has_3arg_affinity); + } +#endif + already_checked = true; + return has_3arg_affinity; +} + +int slurm_setaffinity(pid_t pid, size_t size, const cpu_set_t *mask) +{ + int (*fptr_sched_setaffinity)() = sched_setaffinity; + int rval = 0; + if (use_3arg_affinity()) { + rval = (*fptr_sched_setaffinity)(pid, size, mask); + } else { + rval = (*fptr_sched_setaffinity)(pid, mask); + } + + char mstr[1 + CPU_SETSIZE / 4]; + if (rval) + verbose("sched_setaffinity(%d,%d,0x%s) failed with status %d", + pid, size, cpuset_to_str(mask, mstr), rval); + return (rval); +} + +int slurm_getaffinity(pid_t pid, size_t size, cpu_set_t *mask) +{ + int (*fptr_sched_getaffinity)() = sched_getaffinity; + int rval = 0; + CPU_ZERO(mask); + if (use_3arg_affinity()) { + rval = (*fptr_sched_getaffinity)(pid, size, mask); + } else { + rval = (*fptr_sched_getaffinity)(pid, mask); + } + + char mstr[1 + CPU_SETSIZE / 4]; + if (rval) + verbose("sched_getaffinity(%d,%d,0x%s) failed with status %d", + pid, size, cpuset_to_str(mask, mstr), rval); + + debug3("sched_getaffinity(%d) = 0x%s", pid, cpuset_to_str(mask, mstr)); + return (rval); +} + diff --git a/src/plugins/task/affinity/affinity.h b/src/plugins/task/affinity/affinity.h new file mode 100644 index 00000000000..f87b541aff4 --- /dev/null +++ b/src/plugins/task/affinity/affinity.h @@ -0,0 +1,83 @@ 
+/*****************************************************************************\ + * src/plugins/task/affinity/affinity.h - task affinity plugin + * $Id: affinity.h,v 1.2 2005/11/04 02:46:51 palermo Exp $ + ***************************************************************************** + * Copyright (C) 2005 Hewlett-Packard Development Company, L.P. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+\*****************************************************************************/ +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#if HAVE_SYS_TYPES_H +# include <sys/types.h> +#endif + +#if HAVE_SYS_PRCTL_H +# include <sys/prctl.h> +#endif + +#include <sys/wait.h> +#include <sys/stat.h> +#include <sys/param.h> +#include <sys/poll.h> +#include <unistd.h> +#include <pwd.h> +#include <grp.h> +#include <stdio.h> +#include <string.h> +#include <sys/utsname.h> + +#define __USE_GNU +#include <sched.h> /* SMB */ +#undef __USE_GNU + +#if HAVE_STDLIB_H +# include <stdlib.h> +#endif + +#include <slurm/slurm_errno.h> +#include "src/common/slurm_xlator.h" +#include "src/slurmd/slurmd/slurmd.h" +#include "src/slurmd/slurmstepd/slurmstepd_job.h" + +#include "src/common/cbuf.h" +#include "src/common/hostlist.h" +#include "src/common/log.h" +#include "src/common/node_select.h" +#include "src/common/fd.h" +#include "src/common/safeopen.h" +#include "src/common/slurm_jobacct.h" +#include "src/common/switch.h" +#include "src/common/xsignal.h" +#include "src/common/xstring.h" +#include "src/common/xmalloc.h" +#include "src/common/util-net.h" + +/*** from affinity.c ***/ +void slurm_chkaffinity(cpu_set_t *mask, slurmd_job_t *job, int statval); +int get_cpuset(cpu_set_t *mask, slurmd_job_t *job); +bool use_3arg_affinity(); +int slurm_setaffinity(pid_t pid, size_t size, const cpu_set_t *mask); +int slurm_getaffinity(pid_t pid, size_t size, cpu_set_t *mask); + +/*** from schedutils.c ***/ +int str_to_cpuset(cpu_set_t *mask, const char* str); +char * cpuset_to_str(const cpu_set_t *mask, char *str); + diff --git a/src/plugins/task/affinity/schedutils.c b/src/plugins/task/affinity/schedutils.c new file mode 100644 index 00000000000..1471834965e --- /dev/null +++ b/src/plugins/task/affinity/schedutils.c @@ -0,0 +1,132 @@ +/*****************************************************************************\ + * src/plugins/task/affinity/schedutils.c - scheduling utilities + * $Id: schedutils.c,v 
1.2 2005/11/04 02:46:51 palermo Exp $ + ***************************************************************************** + * Routines in this file are taken from the taskset utility (schedutils pkg) + * Copyright (C) 2004 Robert Love + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ +#include "affinity.h" + +/* + * taskset.c - taskset + * Command-line utility for setting and retrieving a task's CPU affinity + * + * Robert Love <rml@tech9.net> 25 April 2002 + * + * Linux kernels as of 2.5.8 provide the needed syscalls for + * working with a task's cpu affinity. Currently 2.4 does not + * support these syscalls, but patches are available at: + * + * http://www.kernel.org/pub/linux/kernel/people/rml/cpu-affinity/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, v2, as + * published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Copyright (C) 2004 Robert Love + */ + +static inline int val_to_char(int v) +{ + if (v >= 0 && v < 10) + return '0' + v; + else if (v >= 10 && v < 16) + return ('a' - 10) + v; + else + return -1; +} + +static inline int char_to_val(int c) +{ + int cl; + + cl = tolower(c); + if (c >= '0' && c <= '9') + return c - '0'; + else if (cl >= 'a' && cl <= 'f') + return cl + (10 - 'a'); + else + return -1; +} + +int str_to_cpuset(cpu_set_t *mask, const char* str) +{ + int len = strlen(str); + const char *ptr = str + len - 1; + int base = 0; + + /* skip 0x, it's all hex anyway */ + if (len > 1 && !memcmp(str, "0x", 2L)) + str += 2; + + CPU_ZERO(mask); + while (ptr >= str) { + char val = char_to_val(*ptr); + if (val == (char) -1) + return -1; + if (val & 1) + CPU_SET(base, mask); + if (val & 2) + CPU_SET(base + 1, mask); + if (val & 4) + CPU_SET(base + 2, mask); + if (val & 8) + CPU_SET(base + 3, mask); + len--; + ptr--; + base += 4; + } + + return 0; +} + +char * cpuset_to_str(const cpu_set_t *mask, char *str) +{ + int base; + char *ptr = str; + char *ret = 0; + + for (base = CPU_SETSIZE - 4; base >= 0; base -= 4) { + char val = 0; + if (CPU_ISSET(base, mask)) + val |= 1; + if (CPU_ISSET(base + 1, mask)) + val |= 2; + if (CPU_ISSET(base + 2, mask)) + val |= 4; + if (CPU_ISSET(base + 3, mask)) + val |= 8; + if (!ret && val) + ret = ptr; + *ptr++ = val_to_char(val); + } + *ptr = 0; + return ret ? 
ret : ptr - 1; +} + diff --git a/src/plugins/task/affinity/task_affinity.c b/src/plugins/task/affinity/task_affinity.c new file mode 100644 index 00000000000..7efa0f45203 --- /dev/null +++ b/src/plugins/task/affinity/task_affinity.c @@ -0,0 +1,140 @@ +/*****************************************************************************\ + * task_affinity.c - Library for task pre-launch and post_termination + * functions for task affinity support + ***************************************************************************** + * Copyright (C) 2005 Hewlett-Packard Development Company, L.P. + * Modified by Hewlett-Packard for task affinity support using task_none.c + * Copyright (C) 2005 The Regents of the University of California and + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * task_none.c Written by Morris Jette <jette1@llnl.gov>. + * UCRL-CODE-2002-040. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include <signal.h> +#include <sys/types.h> + +#include "affinity.h" + +/* + * These variables are required by the generic plugin interface. 
If they + * are not found in the plugin, the plugin loader will ignore it. + * + * plugin_name - a string giving a human-readable description of the + * plugin. There is no maximum length, but the symbol must refer to + * a valid string. + * + * plugin_type - a string suggesting the type of the plugin or its + * applicability to a particular form of data or method of data handling. + * If the low-level plugin API is used, the contents of this string are + * unimportant and may be anything. SLURM uses the higher-level plugin + * interface which requires this string to be of the form + * + * <application>/<method> + * + * where <application> is a description of the intended application of + * the plugin (e.g., "task" for task control) and <method> is a description + * of how this plugin satisfies that application. SLURM will only load + * a task plugin if the plugin_type string has a prefix of "task/". + * + * plugin_version - an unsigned 32-bit integer giving the version number + * of the plugin. If major and minor revisions are desired, the major + * version number may be multiplied by a suitable magnitude constant such + * as 100 or 1000. Various SLURM versions will likely require a certain + * minimum versions for their plugins as this API matures. + */ +const char plugin_name[] = "task affinity plugin"; +const char plugin_type[] = "task/affinity"; +const uint32_t plugin_version = 100; + +/* + * init() is called when the plugin is loaded, before any other functions + * are called. Put global initialization here. + */ +int init ( void ) +{ + verbose("%s loaded", plugin_name); + use_3arg_affinity(); /* initialize check for older affinity */ + return SLURM_SUCCESS; +} + +/* + * fini() is called when the plugin is removed. Clear any allocated + * storage here. + */ +int fini ( void ) +{ + verbose("%s unloaded", plugin_name); + return SLURM_SUCCESS; +} + +/* + * task_pre_launch() is called prior to exec of application task. 
+ * It is followed by TaskProlog program (from slurm.conf) and
+ * --task-prolog (from srun command line).
+ *
+ * If job->cpu_bind_type is non-zero: the affinity of envtp->task_pid is
+ * read, a mask from get_cpuset() is applied unless CPU_BIND_NONE is set,
+ * and the masks are passed to slurm_chkaffinity(). Returns SLURM_SUCCESS.
+ */
+int task_pre_launch ( slurmd_job_t *job )
+{
+	debug("affinity task_pre_launch: %u.%u, task %d",
+		job->jobid, job->stepid, job->envtp->procid);
+
+	/*** CPU binding support ***/
+	if (job->cpu_bind_type) {
+		cpu_set_t new_mask, cur_mask;
+		pid_t mypid = job->envtp->task_pid;
+
+		int setval = 0;
+		slurm_getaffinity(mypid, sizeof(cur_mask), &cur_mask);
+
+		if (get_cpuset(&new_mask, job)) {
+			if (!(job->cpu_bind_type & CPU_BIND_NONE)) {
+				setval = slurm_setaffinity(mypid,
+						sizeof(new_mask), &new_mask);
+				slurm_getaffinity(mypid,
+						sizeof(cur_mask), &cur_mask);
+			}
+		}
+		/* report new_mask if setval is non-zero, else cur_mask */
+		slurm_chkaffinity(setval ? &new_mask : &cur_mask, job, setval);
+	}
+
+	return SLURM_SUCCESS;
+}
+
+/*
+ * task_post_term() is called after termination of application task.
+ * It is preceded by --task-epilog (from srun command line)
+ * followed by TaskEpilog program (from slurm.conf).
+ */ +int task_post_term ( slurmd_job_t *job ) +{ + debug("affinity task_post_term: %u.%u, task %d", + job->jobid, job->stepid, job->envtp->procid); + + return SLURM_SUCCESS; +} + diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 3d7aa08d4b7..88e08b67cb9 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -607,7 +607,7 @@ _fork_all_tasks(slurmd_job_t *job) if (_become_user(job) < 0) exit(2); - log_fini(); + /* log_fini(); */ /* note: moved into exec_task() */ xsignal_unblock(slurmstepd_blocked_signals); diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index 78a4f9e6107..7f78446aa94 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -170,6 +170,8 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) job->uid = (uid_t) msg->uid; job->gid = (gid_t) msg->gid; job->cwd = xstrdup(msg->cwd); + job->cpu_bind_type = msg->cpu_bind_type; + job->cpu_bind = xstrdup(msg->cpu_bind); job->env = _array_copy(msg->envc, msg->env); job->eio = eio_handle_create(); @@ -189,6 +191,8 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) job->envtp->procid = -1; job->envtp->localid = -1; job->envtp->nodeid = -1; + job->envtp->cpu_bind_type = 0; + job->envtp->cpu_bind = NULL; memcpy(&resp_addr, cli_addr, sizeof(slurm_addr)); slurm_set_addr(&resp_addr, msg->resp_port, NULL); @@ -262,6 +266,8 @@ job_spawn_create(spawn_task_request_msg_t *msg, slurm_addr *cli_addr) job->envtp->procid = -1; job->envtp->localid = -1; job->envtp->nodeid = -1; + job->envtp->cpu_bind_type = 0; + job->envtp->cpu_bind = NULL; memcpy(&io_addr, cli_addr, sizeof(slurm_addr)); slurm_set_addr(&io_addr, msg->io_port, NULL); @@ -348,6 +354,8 @@ job_batch_job_create(batch_job_launch_msg_t *msg) job->envtp->procid = -1; job->envtp->localid = -1; job->envtp->nodeid = -1; + job->envtp->cpu_bind_type = 0; + job->envtp->cpu_bind = NULL; srun = 
srun_info_create(NULL, NULL, NULL); diff --git a/src/slurmd/slurmstepd/slurmstepd_job.h b/src/slurmd/slurmstepd/slurmstepd_job.h index 262a8f15d9d..a6ae03bd210 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.h +++ b/src/slurmd/slurmstepd/slurmstepd_job.h @@ -104,6 +104,8 @@ typedef struct slurmd_job { char **env; /* job environment */ char **argv; /* job argument vector */ char *cwd; /* path to current working directory */ + cpu_bind_type_t cpu_bind_type; /* --cpu_bind= */ + char *cpu_bind; /* binding map for map/mask_cpu */ switch_jobinfo_t switch_job; /* switch-specific job information */ uid_t uid; /* user id for job */ gid_t gid; /* group ID for job */ diff --git a/src/slurmd/slurmstepd/task.c b/src/slurmd/slurmstepd/task.c index a6a180fd859..52e231ae2ca 100644 --- a/src/slurmd/slurmstepd/task.c +++ b/src/slurmd/slurmstepd/task.c @@ -250,12 +250,14 @@ exec_task(slurmd_job_t *job, int i, int waitfd) job->cwd); if (chdir("/tmp") < 0) { error("couldn't chdir to /tmp either. dying."); + log_fini(); exit(4); } } if ((!job->spawn_task) && (set_user_limits(job) < 0)) { debug("Unable to set user limits"); + log_fini(); exit(5); } @@ -268,6 +270,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd) */ if ((rc = read (waitfd, &c, sizeof (c))) != 1) { error ("_exec_task read failed, fd = %d, rc=%d: %m", waitfd, rc); + log_fini(); exit(1); } close(waitfd); @@ -284,6 +287,8 @@ exec_task(slurmd_job_t *job, int i, int waitfd) job->envtp->procid = t->gtid; job->envtp->localid = t->id; job->envtp->task_pid = getpid(); + job->envtp->cpu_bind = xstrdup(job->cpu_bind); + job->envtp->cpu_bind_type = job->cpu_bind_type; setup_env(job->envtp); job->env = job->envtp->env; @@ -295,6 +300,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd) job->nodeid, (uint32_t) i, job->nnodes, job->nprocs, job->task[i]->gtid) < 0) { error("Unable to attach to interconnect: %m"); + log_fini(); exit(1); } @@ -326,6 +332,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd) _run_script("user 
task_prolog", job->task_prolog, job); } + log_fini(); execve(job->argv[0], job->argv, job->env); /* diff --git a/src/srun/launch.c b/src/srun/launch.c index 238c5f5addb..c313311ba0d 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -147,6 +147,8 @@ launch(void *arg) r->switch_job = job->switch_job; r->task_prolog = opt.task_prolog; r->task_epilog = opt.task_epilog; + r->cpu_bind_type = opt.cpu_bind_type; + r->cpu_bind = opt.cpu_bind; r->ofname = fname_remote_string (job->ofname); r->efname = fname_remote_string (job->efname); diff --git a/src/srun/opt.c b/src/srun/opt.c index 0acafd19637..3aefbfb9d43 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -81,6 +81,7 @@ #define OPT_NO_ROTATE 0x0a #define OPT_GEOMETRY 0x0b #define OPT_MPI 0x0c +#define OPT_CPU_BIND 0x0d /* generic getopt_long flags, integers and *not* valid characters */ #define LONG_OPT_HELP 0x100 @@ -107,11 +108,13 @@ #define LONG_OPT_PROLOG 0x117 #define LONG_OPT_EPILOG 0x118 #define LONG_OPT_BEGIN 0x119 -#define LONG_OPT_MAIL_TYPE 0x11a -#define LONG_OPT_MAIL_USER 0x11b +#define LONG_OPT_MAIL_TYPE 0x11a +#define LONG_OPT_MAIL_USER 0x11b #define LONG_OPT_TASK_PROLOG 0x11c #define LONG_OPT_TASK_EPILOG 0x11d #define LONG_OPT_NICE 0x11e +#define LONG_OPT_CPU_BIND 0x11f + /*---- forward declarations of static functions ----*/ @@ -159,6 +162,8 @@ static void _usage(void); static bool _valid_node_list(char **node_list_pptr); static enum task_dist_states _verify_dist_type(const char *arg); static bool _verify_node_count(const char *arg, int *min, int *max); +static int _verify_cpu_bind(const char *arg, char **cpu_bind, + cpu_bind_type_t *cpu_bind_type); static int _verify_geometry(const char *arg, int *geometry); static int _verify_conn_type(const char *arg); @@ -303,6 +308,100 @@ static int _verify_geometry(const char *arg, int *geometry) return rc; }
+/*
+ * verify cpu_bind arguments: "[{q[uiet]|v[erbose]},]type[:list]"
+ * IN/OUT cpu_bind - replaced by a copy of the map/mask list, if one given
+ * IN/OUT cpu_bind_type - verbose flag and binding type bits are updated
+ * returns 1 on an unrecognized binding type, 0 otherwise
+ */
+static int _verify_cpu_bind(const char *arg, char **cpu_bind,
+				cpu_bind_type_t *cpu_bind_type)
+{
+	char *buf = xstrdup(arg);
+	char *pos = buf;
+	/* we support different launch policy names
+	 * we also allow a verbose setting to be specified
+	 *     --cpu_bind=v
+	 *     --cpu_bind=rank,v
+	 *     --cpu_bind=rank
+	 *     --cpu_bind={map_cpu|mask_cpu}:0,1,2,3,4
+	 */
+	if (*pos) {
+		/* parse --cpu_bind command line arguments */
+		int fl_cpubind_verbose = -1;	/* -1 unset, 0 quiet, 1 verbose */
+		char *cmd_line_affinity = NULL;
+		char *cmd_line_mapping = NULL;
+		char *mappos = strchr(pos,':');
+		if (!mappos) {
+			mappos = strchr(pos,'=');
+		}
+		if (*pos=='q' || *pos=='Q') {
+			fl_cpubind_verbose = 0;
+			pos += (strncasecmp(pos, "quiet", 5) == 0) ? 5 : 1;
+		}
+		if (*pos=='v' || *pos=='V') {
+			fl_cpubind_verbose = 1;
+			pos += (strncasecmp(pos, "verbose", 7) == 0) ? 7 : 1;
+		}
+		if (*pos==',') {
+			pos++;
+		}
+		if (*pos) {
+			char *vpos=NULL;
+			cmd_line_affinity = pos;
+			if (((vpos=strstr(pos,",q")) != NULL) ||
+			    ((vpos=strstr(pos,",Q")) != NULL)) {
+				*vpos='\0';
+				fl_cpubind_verbose = 0;
+			}
+			if (((vpos=strstr(pos,",v")) != NULL) ||
+			    ((vpos=strstr(pos,",V")) != NULL)) {
+				*vpos='\0';
+				fl_cpubind_verbose = 1;
+			}
+		}
+		if (mappos) {
+			*mappos='\0';
+			mappos++;
+			cmd_line_mapping=mappos;
+		}
+
+		/* convert parsed command line args into interface */
+		if (cmd_line_mapping) {
+			xfree(*cpu_bind);
+			*cpu_bind = xstrdup(cmd_line_mapping);
+		}
+		if (fl_cpubind_verbose == 1) {
+			*cpu_bind_type |= CPU_BIND_VERBOSE;
+		} else if (fl_cpubind_verbose == 0) {
+			/* explicit "quiet" overrides an earlier "verbose" */
+			*cpu_bind_type &= ~CPU_BIND_VERBOSE;
+		}
+		if (cmd_line_affinity) {
+			*cpu_bind_type &= CPU_BIND_VERBOSE; /* clear any previous type */
+			if ((strcasecmp(cmd_line_affinity, "no") == 0) ||
+			    (strcasecmp(cmd_line_affinity, "none") == 0)) {
+				*cpu_bind_type |= CPU_BIND_NONE;
+			} else if (strcasecmp(cmd_line_affinity, "rank") == 0) {
+				*cpu_bind_type |= CPU_BIND_RANK;
+			} else if ((strcasecmp(cmd_line_affinity, "map_cpu") == 0) ||
+			           (strcasecmp(cmd_line_affinity, "mapcpu") == 0)) {
+				*cpu_bind_type |= CPU_BIND_MAPCPU;
+			} else if ((strcasecmp(cmd_line_affinity, "mask_cpu") == 0) ||
+			           (strcasecmp(cmd_line_affinity, "maskcpu") == 0)) {
+				*cpu_bind_type |= CPU_BIND_MASKCPU;
+			} else {
+				error("unrecognized --cpu_bind argument \"%s\"", cmd_line_affinity);
+				xfree(buf);
+				return 1;
+			}
+		}
+	}
+
+	xfree(buf);
+	return 0;
+}
+
 /* * verify that a node count in arg is of a known form (count or min-max) * OUT min, max specified minimum and maximum node counts @@ -455,6 +554,8 @@ static void _opt_default() opt.min_nodes = 1; opt.max_nodes = 0; opt.nodes_set = false; + opt.cpu_bind_type = 0; + opt.cpu_bind = NULL; opt.time_limit = -1; opt.partition = NULL; opt.max_threads = MAX_THREADS; @@ -564,6 +665,7 @@ env_vars_t env_vars[] = { {"SLURM_CPUS_PER_TASK", OPT_INT, &opt.cpus_per_task, &opt.cpus_set }, {"SLURM_CONN_TYPE", OPT_CONN_TYPE, NULL, NULL }, {"SLURM_CORE_FORMAT", OPT_CORE, NULL, NULL }, + {"SLURM_CPU_BIND", OPT_CPU_BIND, NULL, NULL }, {"SLURM_DEBUG", OPT_DEBUG, NULL, NULL }, {"SLURM_DISTRIBUTION", OPT_DISTRIB, NULL, NULL }, {"SLURM_GEOMETRY", OPT_GEOMETRY, NULL, NULL }, @@ -647,6 +749,12 @@ _process_env_var(env_vars_t *e, const char *val) opt.distribution = dt; break; + case OPT_CPU_BIND: + if (_verify_cpu_bind(val, &opt.cpu_bind, + &opt.cpu_bind_type)) + exit(1); + break; + case OPT_NODES: opt.nodes_set = _verify_node_count( val, &opt.min_nodes, @@ -764,7 +872,8 @@ void set_options(const int argc, char **argv, int first) {"disable-status", no_argument, 0, 'X'}, {"no-allocate", no_argument, 0, 'Z'}, {"contiguous", no_argument, 0, LONG_OPT_CONT}, - {"exclusive", no_argument, 0, LONG_OPT_EXCLUSIVE}, + {"exclusive", no_argument, 0, LONG_OPT_EXCLUSIVE}, + {"cpu_bind", required_argument, 0, LONG_OPT_CPU_BIND}, {"core", required_argument, 0, LONG_OPT_CORE}, {"mincpus", required_argument, 0, LONG_OPT_MINCPU}, {"mem", required_argument, 0, LONG_OPT_MEM}, @@ -1078,6 +1187,11 @@ void set_options(const int argc, char **argv, int first) case LONG_OPT_EXCLUSIVE: opt.exclusive = true; break; + case LONG_OPT_CPU_BIND: + if (_verify_cpu_bind(optarg,
&opt.cpu_bind, + &opt.cpu_bind_type)) + exit(1); + break; case LONG_OPT_CORE: opt.core_type = core_format_type (optarg); if (opt.core_type == CORE_INVALID) @@ -1621,6 +1735,8 @@ static void _opt_list() opt.partition == NULL ? "default" : opt.partition); info("job name : `%s'", opt.job_name); info("distribution : %s", format_task_dist_states(opt.distribution)); + info("cpu_bind : %s", + opt.cpu_bind == NULL ? "default" : opt.cpu_bind); info("core format : %s", core_format_name (opt.core_type)); info("verbose : %d", _verbose); info("slurmd_debug : %d", opt.slurmd_debug); @@ -1692,12 +1808,13 @@ static void _usage(void) " [--core=type] [-T threads] [-W sec] [--attach] [--join] \n" " [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n" " [--mpi=type] [--account=name] [--dependency=jobid]\n" -" [--kill-on-bad-exit] [--propagate[=rlimits] ]\n" +" [--kill-on-bad-exit] [--propagate[=rlimits] ] [--cpu_bind=...]\n" #ifdef HAVE_BG /* Blue gene specific options */ " [--geometry=XxYxZ] [--conn-type=type] [--no-rotate]\n" #endif " [--mail-type=type] [--mail-user=user][--nice[=value]]\n" " [--prolog=fname] [--epilog=fname]\n" +" [--task-prolog=fname] [--task-epilog=fname]\n" " [-w hosts...] [-x hosts...] 
executable [args...]\n"); } @@ -1749,6 +1866,8 @@ static void _help(void) " --mpi=type specifies version of MPI to use\n" " --prolog=program run \"program\" before launching job step\n" " --epilog=program run \"program\" after launching job step\n" +" --task-prolog=program run \"program\" before launching task\n" +" --task-epilog=program run \"program\" after launching task\n" " --begin=time defer job until HH:MM DD/MM/YY\n" " --mail-type=type notify on state change: BEGIN, END, FAIL or ALL\n" " --mail-user=user who to send email notification for job state changes\n" @@ -1776,6 +1895,17 @@ static void _help(void) " --exclusive allocate nodes in exclusive mode when\n" " cpu consumable resource is enabled\n" "\n" +"Affinity/Multi-core options: (when the task/affinity plugin is enabled)\n" +" --cpu_bind= Bind tasks to CPUs\n" +" q[uiet], quietly bind before task runs (default)\n" +" v[erbose], verbosely report binding before task runs\n" +" no[ne] don't bind tasks to CPUs (default)\n" +" rank bind by task rank\n" +" map_cpu:<list> bind by mapping CPU IDs to tasks as specified\n" +" where <list> is <cpuid1>,<cpuid2>,...<cpuidN>\n" +" mask_cpu:<list> bind by setting CPU masks on tasks as specified\n" +" where <list> is <mask1>,<mask2>,...<maskN>\n" +"\n" #ifdef HAVE_AIX /* AIX/Federation specific options */ "AIX related options:\n" " --network=type communication protocol to be used\n" diff --git a/src/srun/opt.h b/src/srun/opt.h index 806f32c4b3a..97e87d1e4cb 100644 --- a/src/srun/opt.h +++ b/src/srun/opt.h @@ -95,6 +95,8 @@ typedef struct srun_options { int max_threads; /* --threads, -T (threads in srun) */ int min_nodes; /* --nodes=n, -N n */ int max_nodes; /* --nodes=x-n, -N x-n */ + cpu_bind_type_t cpu_bind_type; /* --cpu_bind= */ + char *cpu_bind; /* binding map for map/mask_cpu */ bool nodes_set; /* true if nodes explicitly set */ int time_limit; /* --time, -t */ char *partition; /* --partition=n, -p n */ -- GitLab