From 7844a935dabbcb996d78ef745752d5b43b0e7626 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 9 Dec 2005 17:26:01 +0000
Subject: [PATCH] Added support for task affinity for binding tasks to CPUs
 (Daniel     Palermo, HP).

---
 NEWS                                      |   5 +
 configure.ac                              |   4 +
 doc/man/man1/srun.1                       |  64 ++++++
 doc/man/man5/slurm.conf.5                 |   3 +-
 etc/slurm.conf.example                    |  11 +-
 slurm.spec                                |   2 +
 slurm/slurm.h.in                          |   8 +
 src/common/env.c                          |  58 ++++++
 src/common/env.h                          |   3 +
 src/common/slurm_protocol_defs.c          |   1 +
 src/common/slurm_protocol_defs.h          |   2 +
 src/common/slurm_protocol_pack.c          |   4 +
 src/plugins/task/Makefile.am              |   2 +-
 src/plugins/task/affinity/Makefile.am     |  19 ++
 src/plugins/task/affinity/affinity.c      | 239 ++++++++++++++++++++++
 src/plugins/task/affinity/affinity.h      |  83 ++++++++
 src/plugins/task/affinity/schedutils.c    | 132 ++++++++++++
 src/plugins/task/affinity/task_affinity.c | 140 +++++++++++++
 src/slurmd/slurmstepd/mgr.c               |   2 +-
 src/slurmd/slurmstepd/slurmstepd_job.c    |   8 +
 src/slurmd/slurmstepd/slurmstepd_job.h    |   2 +
 src/slurmd/slurmstepd/task.c              |   7 +
 src/srun/launch.c                         |   2 +
 src/srun/opt.c                            | 138 ++++++++++++-
 src/srun/opt.h                            |   2 +
 25 files changed, 929 insertions(+), 12 deletions(-)
 create mode 100644 src/plugins/task/affinity/Makefile.am
 create mode 100644 src/plugins/task/affinity/affinity.c
 create mode 100644 src/plugins/task/affinity/affinity.h
 create mode 100644 src/plugins/task/affinity/schedutils.c
 create mode 100644 src/plugins/task/affinity/task_affinity.c

diff --git a/NEWS b/NEWS
index d46a5578ba0..6cbe21a3a4f 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,11 @@
 This file describes changes in recent versions of SLURM. It primarily
 documents those changes that are of interest to users and admins. 
 
+* Changes in SLURM 0.7.0-pre6
+=============================
+ -- Added support for task affinity for binding tasks to CPUs (Daniel
+    Palermo, HP).
+
 * Changes in SLURM 0.7.0-pre5
 =============================
  -- Enhanced performance and debugging for slurmctld reconfiguration.
diff --git a/configure.ac b/configure.ac
index 0dfbe96b172..7af066370c5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -92,6 +92,9 @@ AC_CHECK_DECLS([hstrerror, strsignal, sys_siglist])
 AC_CHECK_FUNCS(unsetenv, [have_unsetenv=yes])
 AM_CONDITIONAL(HAVE_UNSETENV, test "x$have_unsetenv" = "xyes")
 
+AC_CHECK_FUNCS(sched_setaffinity, [have_sched_setaffinity=yes])
+AM_CONDITIONAL(HAVE_SCHED_SETAFFINITY, test "x$have_sched_setaffinity" = "xyes")
+
 ACX_PTHREAD([], AC_MSG_ERROR([Error: Cannot figure out how to use pthreads!]))
 
 # Always define WITH_PTHREADS if we make it this far
@@ -239,6 +242,7 @@ AC_CONFIG_FILES([Makefile
 		 src/plugins/mpi/lam/Makefile
 		 src/plugins/mpi/none/Makefile
 		 src/plugins/task/Makefile
+		 src/plugins/task/affinity/Makefile
 		 src/plugins/task/none/Makefile
 		 doc/Makefile
 		 doc/man/Makefile
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index db57087f75d..29707bb2613 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -475,6 +475,58 @@ Request that a specific list of hosts not be included in the resources
 allocated to this job. The host list will be assumed to be a filename 
 if it contains a "/"character.
 
+.PP
+Affinity/Multi-core Options: (when the task/affinity plugin is enabled)
+.TP
+\fB\-\-cpu_bind\fR=[{\fIquiet,verbose\fR},]\fItype\fR
+Bind tasks to CPUs
+.RS
+.TP
+.B q[uiet],
+quietly bind before task runs (default)
+.TP
+.B v[erbose],
+verbosely report binding before task runs
+.TP
+.B no[ne]
+don't bind tasks to CPUs (default)
+.TP
+.B rank
+bind by task rank
+.TP
+.B map_cpu:<list>
+bind by mapping CPU IDs to tasks as specified
+where <list> is <cpuid1>,<cpuid2>,...<cpuidN>.
+CPU IDs are interpreted as decimal values unless they are preceded
+with '0x' in which case they are interpreted as hexadecimal values.
+.TP
+.B mask_cpu:<list>
+bind by setting CPU masks on tasks as specified
+where <list> is <mask1>,<mask2>,...<maskN>.
+CPU masks are \fBalways\fR interpreted as hexadecimal values but can be
+preceded with an optional '0x'.
+.RE
+
+To have SLURM always report on the selected binding for all srun commands
+executed in a shell, you can also enable verbose mode separately from
+the command line with:
+.PP
+.nf
+        setenv SLURM_CPU_BIND verbose
+.fi
+.PP
+SLURM_CPU_BIND will not propagate into the task's environment (binding
+by default only affects the first srun).  To propagate --cpu_bind to
+successive srun commands, first do the following in each task:
+.PP
+.nf
+ setenv SLURM_CPU_BIND \\
+   ${SLURM_CPU_BIND_VERBOSE},${SLURM_CPU_BIND_TYPE}${SLURM_CPU_BIND_LIST}
+.fi
+.PP
+See the \fBENVIRONMENT VARIABLES\fR section for a more detailed description
+of the individual SLURM_CPU_BIND* variables.
+
 .PP
 The following options support AIX systems, but may be applicable to 
 other systems as well. Since POE is used to launch tasks, these 
@@ -704,6 +756,9 @@ The location of the SLURM configuration file.
 .TP
 \fBSLURM_ACCOUNT\fR
 \fB\-U, \-\-account\fR=\fIaccount\fR
+.TP
+\fBSLURM_CPU_BIND\fR
+\fB\-\-cpu_bind\fR=\fItype\fR
 .TP 20
 \fBSLURM_CPUS_PER_TASK\fR
 \fB\-c, \-\-ncpus\-per\-task\fR=\fIn\fR
@@ -776,6 +831,15 @@ Additionally,
 will set some environment variables  in the environment of the
 executing tasks on the remote compute nodes. These environment variables
 are:
+.TP
+\fBSLURM_CPU_BIND_VERBOSE\fR
+--cpu_bind verbosity (quiet,verbose).
+.TP
+\fBSLURM_CPU_BIND_TYPE\fR
+--cpu_bind type (none,rank,map_cpu:,mask_cpu:)
+.TP
+\fBSLURM_CPU_BIND_LIST\fR
+--cpu_bind map or mask list (<list of IDs or masks for this node>)
 .TP 20
 \fBSLURM_CPUS_ON_NODE\fR
 Count of processors available to the job on this node
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 98dd95b7e4e..6c4e7f52c67 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -461,7 +461,8 @@ Identifies the type of task launch plugin, typically used to provide
 resource management within a node (e.g. pinning tasks to specific 
 processors).
 Acceptable values include
-"task/none" for systems requiring no special handling.
+"task/none" for systems requiring no special handling
+or "task/affinity" to enable the --cpu_bind CPU affinity srun option.
 The default value is "task/none".
 The order of task prolog/epilog execution is as follows:
 .RS
diff --git a/etc/slurm.conf.example b/etc/slurm.conf.example
index 1db9cc67ffd..23c1ef1f3b5 100644
--- a/etc/slurm.conf.example
+++ b/etc/slurm.conf.example
@@ -450,20 +450,21 @@ JobAcctType=jobacct/none
 #
 # o Define task launch specific parameters
 #
-#    "TaskProlog" : Define a program to be executed as root before each 
+#    "TaskProlog" : Define a program to be executed as the user before each 
 #                   task begins execution.
-#    "TaskEpilog" : Define a program to be executed as root after each 
+#    "TaskEpilog" : Define a program to be executed as the user after each 
 #                   task terminates.
 #    "TaskPlugin" : Define a task launch plugin. This may be used to 
 #                   provide resource management within a node (e.g. pinning
-#                   tasks to specific processors). Permissible values are
-#      "task/none" : no task launch actions, the default.
+#                   tasks to specific processors). Permissible values are:
+#      "task/none"     : no task launch actions, the default.
+#      "task/affinity" : CPU affinity support (see "srun --cpu_bind=")
 #
 # Example:
 #
 # TaskProlog=/usr/local/slurm/etc/task_prolog # default is none
 # TaskEpilog=/usr/local/slurm/etc/task_epilog # default is none
-# TaskPlugin=task/none                        # default is task/none
+# TaskPlugin=task/affinity                    # default is task/none
 
 
 #
diff --git a/slurm.spec b/slurm.spec
index ab76e0725df..24faaeb13a6 100644
--- a/slurm.spec
+++ b/slurm.spec
@@ -159,6 +159,8 @@ touch $LIST
 if [ -d /etc/init.d ]; then
    echo "%config(noreplace) /etc/init.d/slurm" >> $LIST
 fi
+test -f $RPM_BUILD_ROOT/%{_libdir}/slurm/task_affinity.so &&
+   echo %{_libdir}/slurm/task_affinity.so >> $LIST
 
 # Build file lists for optional plugin packages
 for plugin in auth_munge auth_authd sched_wiki; do
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 42df944b0ac..2fae1c36f24 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -211,6 +211,14 @@ enum task_dist_states {
 	SLURM_DIST_UNKNOWN	/* unknown dist */
 };
 
+typedef enum cpu_bind_type {	/* cpu binding type from --cpu_bind=... */
+	CPU_BIND_VERBOSE= 0x01,	/* =v, */
+	CPU_BIND_NONE	= 0x02,	/* =no */
+	CPU_BIND_RANK  	= 0x04,	/* =rank */
+	CPU_BIND_MAPCPU	= 0x08,	/* =map_cpu:<list of CPU IDs> */
+	CPU_BIND_MASKCPU= 0x10,	/* =mask_cpu:<list of CPU masks> */
+} cpu_bind_type_t;
+
 /* The last entry in node_states must be STATE_END, keep in sync with 
  * node_state_string. values may be ORed with NODE_STATE_FLAGS below. 
  * Node states typically alternate between NODE_STATE_IDLE and 
diff --git a/src/common/env.c b/src/common/env.c
index ea18139fa19..44145291e6d 100644
--- a/src/common/env.c
+++ b/src/common/env.c
@@ -183,6 +183,9 @@ void unsetenvp(char **env, const char *name)
 {
 	char **ep;
 
+	if (env == NULL)
+		return;
+
 	ep = env;
 	while ((ep = _find_name_in_env (ep, name)) && (*ep != NULL)) {
 		char **dp = ep;
@@ -260,6 +263,61 @@ int setup_env(env_t *env)
 		}
 	}
 
+	if (env->cpu_bind_type) {
+		unsetenvp(env->env, "SLURM_CPU_BIND");	/* don't propagate SLURM_CPU_BIND */
+		int setstat = 0;	/* collects setenvf() failures for each variable */
+		if (env->cpu_bind_type & CPU_BIND_VERBOSE) {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_VERBOSE", "verbose");
+		} else {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_VERBOSE", "quiet");
+		}
+		if (setstat) {
+			error("Unable to set SLURM_CPU_BIND_VERBOSE");
+			rc = SLURM_FAILURE;
+		}
+
+		setstat = 0;
+		if (env->cpu_bind_type & CPU_BIND_NONE) {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "none");
+		} else if (env->cpu_bind_type & CPU_BIND_RANK) {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "rank");
+		} else if (env->cpu_bind_type & CPU_BIND_MAPCPU) {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "map_cpu:");
+		} else if (env->cpu_bind_type & CPU_BIND_MASKCPU) {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "mask_cpu:");
+		} else if (env->cpu_bind_type & (~CPU_BIND_VERBOSE)) {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "unknown");
+		} else {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "");
+		}
+		if (setstat) {
+			error("Unable to set SLURM_CPU_BIND_TYPE");
+			rc = SLURM_FAILURE;
+		}
+
+		setstat = 0;	/* "%s": cpu_bind is user text, never a format string */
+		if (env->cpu_bind) {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_LIST", "%s", env->cpu_bind);
+		} else {
+			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_LIST", "");
+		}
+		if (setstat) {	/* check the call above; don't repeat it with a NULL list */
+			error("Unable to set SLURM_CPU_BIND_LIST");
+			rc = SLURM_FAILURE;
+		}
+	} else {
+		unsetenvp(env->env, "SLURM_CPU_BIND");	/* don't propagate SLURM_CPU_BIND */
+		/* set SLURM_CPU_BIND_* env vars to defaults */
+		int setstat = 0;
+		setstat |= setenvf(&env->env, "SLURM_CPU_BIND_VERBOSE", "quiet");
+		setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "");
+		setstat |= setenvf(&env->env, "SLURM_CPU_BIND_LIST", "");
+		if (setstat) {
+			error("Unable to clear SLURM_CPU_BIND_*");
+			rc = SLURM_FAILURE;
+		}
+	}
+
 	if (env->overcommit 
 	    && (setenvf(&env->env, "SLURM_OVERCOMMIT", "1"))) {
 		error("Unable to set SLURM_OVERCOMMIT environment variable");
diff --git a/src/common/env.h b/src/common/env.h
index 109b125a877..0dace0a6a1b 100644
--- a/src/common/env.h
+++ b/src/common/env.h
@@ -46,6 +46,9 @@ typedef struct env_options {
 	bool cpus_set;		/* true if cpus_per_task explicitly set */
 	enum distribution_t
 		distribution;	/* --distribution=, -m dist	*/
+	cpu_bind_type_t
+		cpu_bind_type;	/* --cpu_bind=			*/
+	char *cpu_bind;		/* binding map for map/mask_cpu	*/
 	bool overcommit;	/* --overcommit,   -O		*/
 	int  slurmd_debug;	/* --slurmd-debug, -D           */
 	bool labelio;		/* --label-output, -l		*/
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index c7de886dcbb..7f1103a74fc 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -323,6 +323,7 @@ void slurm_free_launch_tasks_request_msg(launch_tasks_request_msg_t * msg)
 		xfree(msg->env);
 	}
 	xfree(msg->cwd);
+	xfree(msg->cpu_bind);
 	if (msg->argv) {
 		for (i = 0; i < msg->argc; i++) {
 			xfree(msg->argv[i]);
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 99957aaef6b..516f4587dc7 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -275,6 +275,8 @@ typedef struct launch_tasks_request_msg {
 	char    **env;
 	char    **argv;
 	char     *cwd;
+	cpu_bind_type_t cpu_bind_type;	/* --cpu_bind=                    */
+	char     *cpu_bind;	/* binding map for map/mask_cpu           */
 	uint16_t  resp_port;
 	uint16_t  io_port;
 	uint16_t  task_flags;
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 27557487289..ade9ee703c4 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -2496,6 +2496,8 @@ _pack_launch_tasks_request_msg(launch_tasks_request_msg_t * msg, Buf buffer)
 	pack32(msg->tasks_to_launch, buffer);
 	packstr_array(msg->env, msg->envc, buffer);
 	packstr(msg->cwd, buffer);
+	pack32(msg->cpu_bind_type, buffer);
+	packstr(msg->cpu_bind, buffer);
 	packstr_array(msg->argv, msg->argc, buffer);
 	pack16(msg->resp_port, buffer);
 	pack16(msg->io_port, buffer);
@@ -2537,6 +2539,8 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t **
 	safe_unpack32(&msg->tasks_to_launch, buffer);
 	safe_unpackstr_array(&msg->env, &msg->envc, buffer);
 	safe_unpackstr_xmalloc(&msg->cwd, &uint16_tmp, buffer);
+	safe_unpack32(&msg->cpu_bind_type, buffer);
+	safe_unpackstr_xmalloc(&msg->cpu_bind, &uint16_tmp, buffer);
 	safe_unpackstr_array(&msg->argv, &msg->argc, buffer);
 	safe_unpack16(&msg->resp_port, buffer);
 	safe_unpack16(&msg->io_port, buffer);
diff --git a/src/plugins/task/Makefile.am b/src/plugins/task/Makefile.am
index 12ad6f77069..922e64385a3 100644
--- a/src/plugins/task/Makefile.am
+++ b/src/plugins/task/Makefile.am
@@ -1,4 +1,4 @@
 # $Id: Makefile.am 4554 2005-03-08 14:25:17Z jking $
 # Makefile for task plugins
 
-SUBDIRS = none
+SUBDIRS = affinity none
diff --git a/src/plugins/task/affinity/Makefile.am b/src/plugins/task/affinity/Makefile.am
new file mode 100644
index 00000000000..4a0e9dd393e
--- /dev/null
+++ b/src/plugins/task/affinity/Makefile.am
@@ -0,0 +1,19 @@
+# $Id: Makefile.am,v 1.1 2005/11/04 02:17:53 palermo Exp $
+# Makefile for task/affinity plugin
+
+AUTOMAKE_OPTIONS = foreign
+
+PLUGIN_FLAGS = -module -avoid-version --export-dynamic 
+
+INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common
+
+if HAVE_SCHED_SETAFFINITY
+pkglib_LTLIBRARIES = task_affinity.la
+task_affinity_la_SOURCES = task_affinity.c affinity.c schedutils.c affinity.h
+task_affinity_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
+
+else
+pkglib_LTLIBRARIES =
+EXTRA_task_affinity_la_SOURCES = task_affinity.c affinity.c schedutils.c affinity.h
+
+endif
diff --git a/src/plugins/task/affinity/affinity.c b/src/plugins/task/affinity/affinity.c
new file mode 100644
index 00000000000..3e967afaa0e
--- /dev/null
+++ b/src/plugins/task/affinity/affinity.c
@@ -0,0 +1,239 @@
+/*****************************************************************************\
+ *  src/plugins/task/affinity/affinity.c - task affinity plugin
+ *  $Id: affinity.c,v 1.2 2005/11/04 02:46:51 palermo Exp $
+ *****************************************************************************
+ *  Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
+ *  
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *  
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *  
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *  
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+#include "affinity.h"
+
+void slurm_chkaffinity(cpu_set_t *mask, slurmd_job_t *job, int statval)
+{
+	char bind_type[42];
+	char status[42];
+	char prefix[42];
+	char suffix[42];
+	char mstr[1 + CPU_SETSIZE / 4];
+	int task_id = job->envtp->procid;
+	pid_t mypid = job->envtp->task_pid;
+
+	if (!(job->cpu_bind_type & CPU_BIND_VERBOSE)) return;
+
+	status[0] = '\0';
+	prefix[0] = '\0';
+	suffix[0] = '\0';
+	if (statval) { strcpy(status, "FAILED "); }
+
+	if (job->cpu_bind_type & CPU_BIND_NONE) {
+		strcpy(bind_type, "set to NO");
+		strcpy(prefix, "current ");
+		sprintf(suffix, "is mask 0x");
+	} else {
+		strcpy(prefix, "setting ");
+		sprintf(suffix, "to mask 0x");
+		if (job->cpu_bind_type & CPU_BIND_RANK) {
+			strcpy(bind_type, "set to RANK");
+		} else if (job->cpu_bind_type & CPU_BIND_MAPCPU) {
+			strcpy(bind_type, "set to MAP_CPU");
+		} else if (job->cpu_bind_type & CPU_BIND_MASKCPU) {
+			strcpy(bind_type, "set to MASK_CPU");
+		} else if (job->cpu_bind_type & (~CPU_BIND_VERBOSE)) {
+			strcpy(bind_type, "set to UNKNOWN");
+		} else {
+			strcpy(bind_type, "not set");
+			strcpy(prefix, "current ");
+			sprintf(suffix, "is mask 0x");
+		}
+	}
+
+	fprintf(stderr, "SLURM_CPU_BIND_TYPE %s, "
+			"%s%saffinity of task %u pid %u on host %s %s%s\n",
+			bind_type,
+			status,
+			prefix,
+			task_id,
+			mypid,
+			conf->hostname,
+			suffix,
+			cpuset_to_str(mask, mstr));
+}
+
+/*
+ * get_cpuset - build the CPU mask requested by --cpu_bind for this task
+ * mask OUT - cleared, then filled for rank, map_cpu and mask_cpu binding
+ * job  IN  - cpu_bind_type/cpu_bind hold the request; envtp->localid selects
+ *            the list entry, wrapping when the list has fewer entries
+ * RET true on success (empty mask for bind type none), false on a bad list
+ */
+int get_cpuset(cpu_set_t *mask, slurmd_job_t *job)
+{
+	int nummasks = 1, i;
+	char *curstr, *selstr = NULL;
+	char mstr[1 + CPU_SETSIZE / 4];
+	int local_id = job->envtp->localid;
+
+	debug3("get_cpuset (%d) %s\n", job->cpu_bind_type, job->cpu_bind);
+	CPU_ZERO(mask);
+
+	if (job->cpu_bind_type & CPU_BIND_NONE)
+		return true;
+
+	if (job->cpu_bind_type & CPU_BIND_RANK) {
+		CPU_SET(job->envtp->localid % job->cpus, mask);
+		return true;
+	}
+
+	if (!job->cpu_bind)
+		return false;
+
+	/* scan for entry local_id, counting entries in case we must wrap */
+	curstr = job->cpu_bind;
+	while (*curstr) {
+		if (nummasks == local_id+1) {
+			selstr = curstr;
+			break;
+		}
+		if (*curstr == ',')
+			nummasks++;
+		curstr++;
+	}
+
+	/* if we didn't already find the mask... */
+	if (!selstr) {
+		/* ...select mask string by wrapping task ID into list */
+		i = local_id % nummasks;
+		curstr = job->cpu_bind;
+		while (*curstr && i) {
+			if (*curstr == ',')
+				i--;
+			curstr++;
+		}
+		if (!*curstr) {
+			return false;
+		}
+		selstr = curstr;
+	}
+
+	/* extract the selected mask from the list */
+	i = 0;
+	curstr = mstr;
+	while (*selstr && *selstr != ',' && i++ < (CPU_SETSIZE/4))
+		*curstr++ = *selstr++;
+	*curstr = '\0';
+
+	if (job->cpu_bind_type & CPU_BIND_MASKCPU) {
+		/* convert mask string into cpu_set_t mask */
+		if (str_to_cpuset(mask, mstr) < 0) {
+			error("str_to_cpuset %s", mstr);
+			return false;
+		}
+		return true;
+	}
+
+	if (job->cpu_bind_type & CPU_BIND_MAPCPU) {
+		int hex = (strncmp(mstr, "0x", 2) == 0);	/* "0x" => hex, else decimal */
+		unsigned long mycpu = strtoul(hex ? &mstr[2] : mstr, NULL, hex ? 16 : 10);
+		if (mycpu >= CPU_SETSIZE) {	/* CPU_SET must stay inside the set */
+			error("map_cpu CPU id %s out of range", mstr);
+			return false;
+		}
+		CPU_SET(mycpu, mask);
+		return true;
+	}
+
+	return false;
+}
+
+/* use_3arg_affinity
+ *
+ * NOTE: some confusion in this.
+ * At first it seems:
+ * if glibc 2.3.2 then
+ *     call sched_setaffinity(pid,mask)
+ * else
+ *     call sched_setaffinity(pid,len,mask)
+ * but then some 2.4 kernels also have the
+ * 3 arg version - so its a mess.
+ */
+#if defined __GLIBC__
+#include <gnu/libc-version.h>		/* for gnu_get_libc_version */
+#endif
+bool use_3arg_affinity()
+{
+	static bool has_3arg_affinity = true;
+	static bool already_checked   = false;
+	if (already_checked) {
+	    	return has_3arg_affinity;
+	}
+#if defined __GLIBC__
+	const char *glibc_vers = gnu_get_libc_version();
+	if (glibc_vers != NULL) {
+	    	int scnt = 0, major = 0, minor = 0, point = 0;
+		scnt = sscanf (glibc_vers, "%d.%d.%d", &major,
+			       &minor, &point);
+		if (scnt == 3) {
+			if ((major <= 2) && (minor <= 3) && (point <= 2)) {
+				has_3arg_affinity = false;
+			}
+		}
+		debug3("glibc version: %d.%d.%d (%d)\n",
+				major, minor, point, has_3arg_affinity);
+	}
+#endif
+	already_checked = true;
+	return has_3arg_affinity;
+}
+
+/* Set the CPU affinity of pid to mask via the 2-arg or 3-arg sched_setaffinity. */
+int slurm_setaffinity(pid_t pid, size_t size, const cpu_set_t *mask)
+{
+	int (*fptr_sched_setaffinity)() = sched_setaffinity;
+	int rval = 0;
+	if (use_3arg_affinity()) {
+		rval = (*fptr_sched_setaffinity)(pid, size, mask);
+	} else {
+		rval = (*fptr_sched_setaffinity)(pid, mask);
+	}
+	char mstr[1 + CPU_SETSIZE / 4];
+	if (rval)	/* casts keep the varargs in step with the format string */
+		verbose("sched_setaffinity(%d,%lu,0x%s) failed with status %d",
+			(int) pid, (unsigned long) size, cpuset_to_str(mask, mstr), rval);
+	return (rval);
+}
+
+/* Read the CPU affinity of pid into mask (cleared first); returns the call's status. */
+int slurm_getaffinity(pid_t pid, size_t size, cpu_set_t *mask)
+{
+	int (*fptr_sched_getaffinity)() = sched_getaffinity;
+	int rval = 0;
+	CPU_ZERO(mask);
+	if (use_3arg_affinity()) {
+		rval = (*fptr_sched_getaffinity)(pid, size, mask);
+	} else {
+		rval = (*fptr_sched_getaffinity)(pid, mask);
+	}
+
+	char mstr[1 + CPU_SETSIZE / 4];
+	if (rval)	/* casts keep the varargs in step with the format string */
+		verbose("sched_getaffinity(%d,%lu,0x%s) failed with status %d",
+			(int) pid, (unsigned long) size, cpuset_to_str(mask, mstr), rval);
+	debug3("sched_getaffinity(%d) = 0x%s", (int) pid, cpuset_to_str(mask, mstr));
+	return (rval);
+}
+
diff --git a/src/plugins/task/affinity/affinity.h b/src/plugins/task/affinity/affinity.h
new file mode 100644
index 00000000000..f87b541aff4
--- /dev/null
+++ b/src/plugins/task/affinity/affinity.h
@@ -0,0 +1,83 @@
+/*****************************************************************************\
+ *  src/plugins/task/affinity/affinity.h - task affinity plugin
+ *  $Id: affinity.h,v 1.2 2005/11/04 02:46:51 palermo Exp $
+ *****************************************************************************
+ *  Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
+ *  
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *  
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *  
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *  
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+#if HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#if HAVE_SYS_TYPES_H
+#  include <sys/types.h>
+#endif
+
+#if HAVE_SYS_PRCTL_H
+#  include <sys/prctl.h>
+#endif
+
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <sys/poll.h>
+#include <unistd.h>
+#include <pwd.h>
+#include <grp.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/utsname.h>
+
+#define __USE_GNU
+#include <sched.h> /* SMB */
+#undef __USE_GNU
+
+#if HAVE_STDLIB_H
+#  include <stdlib.h>
+#endif
+
+#include <slurm/slurm_errno.h>
+#include "src/common/slurm_xlator.h"
+#include "src/slurmd/slurmd/slurmd.h"
+#include "src/slurmd/slurmstepd/slurmstepd_job.h"
+
+#include "src/common/cbuf.h"
+#include "src/common/hostlist.h"
+#include "src/common/log.h"
+#include "src/common/node_select.h"
+#include "src/common/fd.h"
+#include "src/common/safeopen.h"
+#include "src/common/slurm_jobacct.h"
+#include "src/common/switch.h"
+#include "src/common/xsignal.h"
+#include "src/common/xstring.h"
+#include "src/common/xmalloc.h"
+#include "src/common/util-net.h"
+
+/*** from affinity.c ***/
+void	slurm_chkaffinity(cpu_set_t *mask, slurmd_job_t *job, int statval);
+int	get_cpuset(cpu_set_t *mask, slurmd_job_t *job);
+bool	use_3arg_affinity();
+int	slurm_setaffinity(pid_t pid, size_t size, const cpu_set_t *mask);
+int	slurm_getaffinity(pid_t pid, size_t size, cpu_set_t *mask);
+
+/*** from schedutils.c ***/
+int	str_to_cpuset(cpu_set_t *mask, const char* str);
+char *	cpuset_to_str(const cpu_set_t *mask, char *str);
+
diff --git a/src/plugins/task/affinity/schedutils.c b/src/plugins/task/affinity/schedutils.c
new file mode 100644
index 00000000000..1471834965e
--- /dev/null
+++ b/src/plugins/task/affinity/schedutils.c
@@ -0,0 +1,132 @@
+/*****************************************************************************\
+ *  src/plugins/task/affinity/schedutils.c - scheduling utilities
+ *  $Id: schedutils.c,v 1.2 2005/11/04 02:46:51 palermo Exp $
+ *****************************************************************************
+ *  Routines in this file are taken from the taskset utility (schedutils pkg)
+ *  Copyright (C) 2004 Robert Love
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *  
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *  
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *  
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+#include "affinity.h"
+
+/*
+ * taskset.c - taskset
+ * Command-line utility for setting and retrieving a task's CPU affinity
+ *
+ * Robert Love <rml@tech9.net>		25 April 2002
+ *
+ * Linux kernels as of 2.5.8 provide the needed syscalls for
+ * working with a task's cpu affinity.  Currently 2.4 does not
+ * support these syscalls, but patches are available at:
+ *
+ * 	http://www.kernel.org/pub/linux/kernel/people/rml/cpu-affinity/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, v2, as
+ * published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Copyright (C) 2004 Robert Love
+ */
+
+static inline int val_to_char(int v)
+{
+	if (v >= 0 && v < 10)
+		return '0' + v;
+	else if (v >= 10 && v < 16)
+		return ('a' - 10) + v;
+	else 
+		return -1;
+}
+
+/* Map one hex digit character (either case) to 0..15; -1 if not a hex digit. */
+static inline int char_to_val(int c)
+{
+	/* plain range tests: no <ctype.h> needed, safe for negative char values */
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+	return -1;
+}
+
+int str_to_cpuset(cpu_set_t *mask, const char* str)
+{
+	int len = strlen(str);
+	const char *ptr = str + len - 1;
+	int base = 0;
+
+	/* skip 0x, it's all hex anyway */
+	if (len > 1 && !memcmp(str, "0x", 2L))
+		str += 2;
+
+	CPU_ZERO(mask);
+	while (ptr >= str) {
+		char val = char_to_val(*ptr);
+		if (val == (char) -1)
+			return -1;
+		if (val & 1)
+			CPU_SET(base, mask);
+		if (val & 2)
+			CPU_SET(base + 1, mask);
+		if (val & 4)
+			CPU_SET(base + 2, mask);
+		if (val & 8)
+			CPU_SET(base + 3, mask);
+		len--;
+		ptr--;
+		base += 4;
+	}
+
+	return 0;
+}
+
+char * cpuset_to_str(const cpu_set_t *mask, char *str)
+{
+	int base;
+	char *ptr = str;
+	char *ret = 0;
+
+	for (base = CPU_SETSIZE - 4; base >= 0; base -= 4) {
+		char val = 0;
+		if (CPU_ISSET(base, mask))
+			val |= 1;
+		if (CPU_ISSET(base + 1, mask))
+			val |= 2;
+		if (CPU_ISSET(base + 2, mask))
+			val |= 4;
+		if (CPU_ISSET(base + 3, mask))
+			val |= 8;
+		if (!ret && val)
+			ret = ptr;
+		*ptr++ = val_to_char(val);
+	}
+	*ptr = 0;
+	return ret ? ret : ptr - 1;
+}
+
diff --git a/src/plugins/task/affinity/task_affinity.c b/src/plugins/task/affinity/task_affinity.c
new file mode 100644
index 00000000000..7efa0f45203
--- /dev/null
+++ b/src/plugins/task/affinity/task_affinity.c
@@ -0,0 +1,140 @@
+/*****************************************************************************\
+ *  task_affinity.c - Library for task pre-launch and post_termination
+ *	functions for task affinity support
+ *****************************************************************************
+ *  Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
+ *  Modified by Hewlett-Packard for task affinity support using task_none.c
+ *  Copyright (C) 2005 The Regents of the University of California and
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  task_none.c Written by Morris Jette <jette1@llnl.gov>. 
+ *  UCRL-CODE-2002-040.
+ *  
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *  
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *  
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *  
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+
+#if     HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include <signal.h>
+#include <sys/types.h>
+
+#include "affinity.h"
+
+/*
+ * These variables are required by the generic plugin interface.  If they
+ * are not found in the plugin, the plugin loader will ignore it.
+ *
+ * plugin_name - a string giving a human-readable description of the
+ * plugin.  There is no maximum length, but the symbol must refer to
+ * a valid string.
+ *
+ * plugin_type - a string suggesting the type of the plugin or its
+ * applicability to a particular form of data or method of data handling.
+ * If the low-level plugin API is used, the contents of this string are
+ * unimportant and may be anything.  SLURM uses the higher-level plugin
+ * interface which requires this string to be of the form
+ *
+ *      <application>/<method>
+ *
+ * where <application> is a description of the intended application of
+ * the plugin (e.g., "task" for task control) and <method> is a description 
+ * of how this plugin satisfies that application.  SLURM will only load
+ * a task plugin if the plugin_type string has a prefix of "task/".
+ *
+ * plugin_version - an unsigned 32-bit integer giving the version number
+ * of the plugin.  If major and minor revisions are desired, the major
+ * version number may be multiplied by a suitable magnitude constant such
+ * as 100 or 1000.  Various SLURM versions will likely require a certain
+ * minimum versions for their plugins as this API matures.
+ */
+const char plugin_name[]        = "task affinity plugin";	/* human-readable description */
+const char plugin_type[]        = "task/affinity";	/* <application>/<method>; "task/" prefix required */
+const uint32_t plugin_version   = 100;	/* plugin version number */
+
+/*
+ * init() is called when the plugin is loaded, before any other functions
+ *	are called.  Logs the load, runs the older-affinity check once.
+ */
+int init ( void )
+{
+	verbose("%s loaded", plugin_name);
+	use_3arg_affinity();	/* initialize check for older affinity */
+	return SLURM_SUCCESS;
+}
+
+/*
+ * fini() is called when the plugin is removed.  Nothing is freed here;
+ *	it only logs the unload and returns SLURM_SUCCESS.
+ */
+int fini ( void )
+{
+	verbose("%s unloaded", plugin_name);
+	return SLURM_SUCCESS;
+}
+
+/*
+ * task_pre_launch() is called prior to exec of application task.
+ *	It is followed by TaskProlog program (from slurm.conf) and
+ *	--task-prolog (from srun command line).
+ *
+ *	When job->cpu_bind_type is non-zero, asks get_cpuset() for the task's
+ *	CPU mask and, unless CPU_BIND_NONE is set, passes it to
+ *	slurm_setaffinity() for the task's pid; slurm_chkaffinity() is then
+ *	handed the mask and the set result.  Always returns SLURM_SUCCESS.
+ */
+int task_pre_launch ( slurmd_job_t *job )
+{
+	debug("affinity task_pre_launch: %u.%u, task %d",
+		job->jobid, job->stepid, job->envtp->procid);
+
+	/*** CPU binding support ***/
+	if (job->cpu_bind_type) {
+		cpu_set_t new_mask, cur_mask;
+		pid_t mypid  = job->envtp->task_pid;
+		int setval = 0;	/* slurm_setaffinity() result, 0 if not called */
+		/* snapshot the current mask in case no new one is applied */
+		slurm_getaffinity(mypid, sizeof(cur_mask), &cur_mask);
+
+		if (get_cpuset(&new_mask, job) &&
+		    !(job->cpu_bind_type & CPU_BIND_NONE)) {
+			setval = slurm_setaffinity(mypid,
+					sizeof(new_mask), &new_mask);
+			/* re-read the mask after the set attempt */
+			slurm_getaffinity(mypid, sizeof(cur_mask), &cur_mask);
+		}
+		/* a non-zero setval passes along the mask that was requested */
+		slurm_chkaffinity(setval ? &new_mask : &cur_mask, job, setval);
+	}
+
+	return SLURM_SUCCESS;
+}
+
+/*
+ * task_post_term() is called after termination of application task.
+ *	It is preceded by --task-epilog (from srun command line)
+ *	and followed by TaskEpilog program (from slurm.conf).
+ */
+int task_post_term ( slurmd_job_t *job )
+{
+	debug("affinity task_post_term: %u.%u, task %d",
+		job->jobid, job->stepid, job->envtp->procid);
+
+	return SLURM_SUCCESS;
+}
+
diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c
index 3d7aa08d4b7..88e08b67cb9 100644
--- a/src/slurmd/slurmstepd/mgr.c
+++ b/src/slurmd/slurmstepd/mgr.c
@@ -607,7 +607,7 @@ _fork_all_tasks(slurmd_job_t *job)
 			if (_become_user(job) < 0) 
 				exit(2);
 
-			log_fini();
+			/* log_fini(); */ /* note: moved into exec_task() */
 
 			xsignal_unblock(slurmstepd_blocked_signals);
 
diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c
index 78a4f9e6107..7f78446aa94 100644
--- a/src/slurmd/slurmstepd/slurmstepd_job.c
+++ b/src/slurmd/slurmstepd/slurmstepd_job.c
@@ -170,6 +170,8 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr)
 	job->uid     = (uid_t) msg->uid;
 	job->gid     = (gid_t) msg->gid;
 	job->cwd     = xstrdup(msg->cwd);
+	job->cpu_bind_type = msg->cpu_bind_type;
+	job->cpu_bind = xstrdup(msg->cpu_bind);
 
 	job->env     = _array_copy(msg->envc, msg->env);
 	job->eio     = eio_handle_create();
@@ -189,6 +191,8 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr)
 	job->envtp->procid = -1;
 	job->envtp->localid = -1;
 	job->envtp->nodeid = -1;	
+	job->envtp->cpu_bind_type = 0;
+	job->envtp->cpu_bind = NULL;
 	
 	memcpy(&resp_addr, cli_addr, sizeof(slurm_addr));
 	slurm_set_addr(&resp_addr, msg->resp_port, NULL); 
@@ -262,6 +266,8 @@ job_spawn_create(spawn_task_request_msg_t *msg, slurm_addr *cli_addr)
 	job->envtp->procid = -1;
 	job->envtp->localid = -1;
 	job->envtp->nodeid = -1;
+	job->envtp->cpu_bind_type = 0;
+	job->envtp->cpu_bind = NULL;
 	
 	memcpy(&io_addr,   cli_addr, sizeof(slurm_addr));
 	slurm_set_addr(&io_addr,   msg->io_port,   NULL); 
@@ -348,6 +354,8 @@ job_batch_job_create(batch_job_launch_msg_t *msg)
 	job->envtp->procid = -1;
 	job->envtp->localid = -1;
 	job->envtp->nodeid = -1;
+	job->envtp->cpu_bind_type = 0;
+	job->envtp->cpu_bind = NULL;
 	
 	srun = srun_info_create(NULL, NULL, NULL);
 
diff --git a/src/slurmd/slurmstepd/slurmstepd_job.h b/src/slurmd/slurmstepd/slurmstepd_job.h
index 262a8f15d9d..a6ae03bd210 100644
--- a/src/slurmd/slurmstepd/slurmstepd_job.h
+++ b/src/slurmd/slurmstepd/slurmstepd_job.h
@@ -104,6 +104,8 @@ typedef struct slurmd_job {
 	char         **env;    /* job environment                           */
 	char         **argv;   /* job argument vector                       */
 	char          *cwd;    /* path to current working directory         */
+	cpu_bind_type_t cpu_bind_type; /* --cpu_bind=                       */
+	char          *cpu_bind;       /* binding map for map/mask_cpu      */
 	switch_jobinfo_t switch_job; /* switch-specific job information     */
 	uid_t         uid;     /* user id for job                           */
 	gid_t         gid;     /* group ID for job                          */
diff --git a/src/slurmd/slurmstepd/task.c b/src/slurmd/slurmstepd/task.c
index a6a180fd859..52e231ae2ca 100644
--- a/src/slurmd/slurmstepd/task.c
+++ b/src/slurmd/slurmstepd/task.c
@@ -250,12 +250,14 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
 		      job->cwd);
 		if (chdir("/tmp") < 0) {
 			error("couldn't chdir to /tmp either. dying.");
+			log_fini();
 			exit(4);
 		}
 	}
 
 	if ((!job->spawn_task) && (set_user_limits(job) < 0)) {
 		debug("Unable to set user limits");
+		log_fini();
 		exit(5);
 	}
 
@@ -268,6 +270,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
 	 */
         if ((rc = read (waitfd, &c, sizeof (c))) != 1) {
 	        error ("_exec_task read failed, fd = %d, rc=%d: %m", waitfd, rc);
+		log_fini();
 		exit(1);
 	}
 	close(waitfd);
@@ -284,6 +287,8 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
 	job->envtp->procid = t->gtid;
 	job->envtp->localid = t->id;
 	job->envtp->task_pid = getpid();
+	job->envtp->cpu_bind = xstrdup(job->cpu_bind);
+	job->envtp->cpu_bind_type = job->cpu_bind_type;
 	
 	setup_env(job->envtp);
 	job->env = job->envtp->env;
@@ -295,6 +300,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
 				job->nodeid, (uint32_t) i, job->nnodes,
 				job->nprocs, job->task[i]->gtid) < 0) {
 			error("Unable to attach to interconnect: %m");
+			log_fini();
 			exit(1);
 		}
 
@@ -326,6 +332,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
 		_run_script("user task_prolog", job->task_prolog, job); 
 	}
 
+	log_fini();
 	execve(job->argv[0], job->argv, job->env);
 
 	/* 
diff --git a/src/srun/launch.c b/src/srun/launch.c
index 238c5f5addb..c313311ba0d 100644
--- a/src/srun/launch.c
+++ b/src/srun/launch.c
@@ -147,6 +147,8 @@ launch(void *arg)
 		r->switch_job      = job->switch_job;
 		r->task_prolog     = opt.task_prolog;
 		r->task_epilog     = opt.task_epilog;
+		r->cpu_bind_type   = opt.cpu_bind_type;
+		r->cpu_bind        = opt.cpu_bind;
 
 		r->ofname  = fname_remote_string (job->ofname);
 		r->efname  = fname_remote_string (job->efname);
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 0acafd19637..3aefbfb9d43 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -81,6 +81,7 @@
 #define OPT_NO_ROTATE	0x0a
 #define OPT_GEOMETRY	0x0b
 #define OPT_MPI         0x0c
+#define OPT_CPU_BIND    0x0d
 
 /* generic getopt_long flags, integers and *not* valid characters */
 #define LONG_OPT_HELP     0x100
@@ -107,11 +108,13 @@
 #define LONG_OPT_PROLOG   0x117
 #define LONG_OPT_EPILOG   0x118
 #define LONG_OPT_BEGIN    0x119
-#define LONG_OPT_MAIL_TYPE 0x11a
-#define LONG_OPT_MAIL_USER 0x11b
+#define LONG_OPT_MAIL_TYPE   0x11a
+#define LONG_OPT_MAIL_USER   0x11b
 #define LONG_OPT_TASK_PROLOG 0x11c
 #define LONG_OPT_TASK_EPILOG 0x11d
 #define LONG_OPT_NICE        0x11e
+#define LONG_OPT_CPU_BIND    0x11f
+
 
 /*---- forward declarations of static functions  ----*/
 
@@ -159,6 +162,8 @@ static void  _usage(void);
 static bool  _valid_node_list(char **node_list_pptr);
 static enum  task_dist_states _verify_dist_type(const char *arg);
 static bool  _verify_node_count(const char *arg, int *min, int *max);
+static int   _verify_cpu_bind(const char *arg, char **cpu_bind,
+					cpu_bind_type_t *cpu_bind_type);
 static int   _verify_geometry(const char *arg, int *geometry);
 static int   _verify_conn_type(const char *arg);
 
@@ -303,6 +308,100 @@ static int _verify_geometry(const char *arg, int *geometry)
 	return rc;
 }
 
+/*
+ * verify cpu_bind arguments; updates *cpu_bind and *cpu_bind_type in place
+ * returns 1 on an unrecognized binding type, 0 otherwise
+ */
+static int _verify_cpu_bind(const char *arg, char **cpu_bind, cpu_bind_type_t *cpu_bind_type)
+{
+    	char *buf = xstrdup(arg);	/* scratch copy; parsing writes NULs into it */
+	char *pos = buf;
+	/* we support different launch policy names
+	 * we also allow a quiet/verbose setting to be specified
+	 *     --cpu_bind=v
+	 *     --cpu_bind=rank,v
+	 *     --cpu_bind=rank
+	 *     --cpu_bind={map_cpu|mask_cpu}:0,1,2,3,4
+	 */
+	if (*pos) {
+		/* parse --cpu_bind command line arguments */
+		bool fl_cpubind_verbose = 0;
+	        char *cmd_line_affinity = NULL;
+	        char *cmd_line_mapping  = NULL;
+		char *mappos = strchr(pos,':');	/* list follows ':' (or '=') */
+		if (!mappos) {
+		    	mappos = strchr(pos,'=');
+		}
+		if (strncasecmp(pos, "quiet", 5) == 0) {
+			fl_cpubind_verbose=0;
+			pos+=5;
+		} else if (*pos=='q' || *pos=='Q') {
+			fl_cpubind_verbose=0;
+			pos++;
+		}
+		if (strncasecmp(pos, "verbose", 7) == 0) {
+			fl_cpubind_verbose=1;
+			pos+=7;
+		} else if (*pos=='v' || *pos=='V') {
+			fl_cpubind_verbose=1;
+			pos++;
+		}
+		if (*pos==',') {
+			pos++;
+		}
+		if (*pos) {
+			char *vpos=NULL;
+			cmd_line_affinity = pos;
+			if (((vpos=strstr(pos,",q")) !=0  ) ||
+			    ((vpos=strstr(pos,",Q")) !=0  )) {
+				*vpos='\0';
+				fl_cpubind_verbose=0;
+			}
+			if (((vpos=strstr(pos,",v")) !=0  ) ||
+			    ((vpos=strstr(pos,",V")) !=0  )) {
+				*vpos='\0';
+				fl_cpubind_verbose=1;
+			}
+		}
+		if (mappos) {
+			*mappos='\0'; 
+			mappos++;
+			cmd_line_mapping=mappos;
+		}
+
+		/* convert parsed command line args into interface */
+		if (cmd_line_mapping) {
+			xfree(*cpu_bind);
+			*cpu_bind = xstrdup(cmd_line_mapping);
+		}
+		if (fl_cpubind_verbose) {
+		        *cpu_bind_type |= CPU_BIND_VERBOSE;
+		}
+		if (cmd_line_affinity) {
+			*cpu_bind_type &= CPU_BIND_VERBOSE;	/* clear any previous type, keep verbose bit */
+			if ((strcasecmp(cmd_line_affinity, "no") == 0) ||
+			    (strcasecmp(cmd_line_affinity, "none") == 0)) {
+				*cpu_bind_type |= CPU_BIND_NONE;
+			} else if (strcasecmp(cmd_line_affinity, "rank") == 0) {
+				*cpu_bind_type |= CPU_BIND_RANK;
+			} else if ((strcasecmp(cmd_line_affinity, "map_cpu") == 0) ||
+			           (strcasecmp(cmd_line_affinity, "mapcpu") == 0)) {
+				*cpu_bind_type |= CPU_BIND_MAPCPU;
+			} else if ((strcasecmp(cmd_line_affinity, "mask_cpu") == 0) ||
+			           (strcasecmp(cmd_line_affinity, "maskcpu") == 0)) {
+				*cpu_bind_type |= CPU_BIND_MASKCPU;
+			} else {
+				error("unrecognized --cpu_bind argument \"%s\"", cmd_line_affinity);
+				xfree(buf);
+				return 1;
+			}
+		}
+	}
+
+	xfree(buf);
+	return 0;
+}
+
 /* 
  * verify that a node count in arg is of a known form (count or min-max)
  * OUT min, max specified minimum and maximum node counts
@@ -455,6 +554,8 @@ static void _opt_default()
 	opt.min_nodes = 1;
 	opt.max_nodes = 0;
 	opt.nodes_set = false;
+	opt.cpu_bind_type = 0;
+	opt.cpu_bind = NULL;
 	opt.time_limit = -1;
 	opt.partition = NULL;
 	opt.max_threads = MAX_THREADS;
@@ -564,6 +665,7 @@ env_vars_t env_vars[] = {
   {"SLURM_CPUS_PER_TASK", OPT_INT,        &opt.cpus_per_task, &opt.cpus_set  },
   {"SLURM_CONN_TYPE",     OPT_CONN_TYPE,  NULL,               NULL           },
   {"SLURM_CORE_FORMAT",   OPT_CORE,       NULL,               NULL           },
+  {"SLURM_CPU_BIND",      OPT_CPU_BIND,   NULL,               NULL           },
   {"SLURM_DEBUG",         OPT_DEBUG,      NULL,               NULL           },
   {"SLURM_DISTRIBUTION",  OPT_DISTRIB,    NULL,               NULL           },
   {"SLURM_GEOMETRY",      OPT_GEOMETRY,   NULL,               NULL           },
@@ -647,6 +749,12 @@ _process_env_var(env_vars_t *e, const char *val)
 		    opt.distribution = dt;
 	    break;
 
+	case OPT_CPU_BIND:
+	    if (_verify_cpu_bind(val, &opt.cpu_bind,
+					    &opt.cpu_bind_type))
+		    exit(1);
+	    break;
+
 	case OPT_NODES:
 	    opt.nodes_set = _verify_node_count( val, 
 			                        &opt.min_nodes, 
@@ -764,7 +872,8 @@ void set_options(const int argc, char **argv, int first)
 		{"disable-status", no_argument,      0, 'X'},
 		{"no-allocate",   no_argument,       0, 'Z'},
 		{"contiguous",       no_argument,       0, LONG_OPT_CONT},
-                {"exclusive",        no_argument,       0, LONG_OPT_EXCLUSIVE},
+		{"exclusive",        no_argument,       0, LONG_OPT_EXCLUSIVE},
+		{"cpu_bind",         required_argument, 0, LONG_OPT_CPU_BIND},
 		{"core",             required_argument, 0, LONG_OPT_CORE},
 		{"mincpus",          required_argument, 0, LONG_OPT_MINCPU},
 		{"mem",              required_argument, 0, LONG_OPT_MEM},
@@ -1078,6 +1187,11 @@ void set_options(const int argc, char **argv, int first)
                 case LONG_OPT_EXCLUSIVE:
                         opt.exclusive = true;
                         break;
+                case LONG_OPT_CPU_BIND:
+			if (_verify_cpu_bind(optarg, &opt.cpu_bind,
+							&opt.cpu_bind_type))
+				exit(1);
+			break;
 		case LONG_OPT_CORE:
 			opt.core_type = core_format_type (optarg);
 			if (opt.core_type == CORE_INVALID)
@@ -1621,6 +1735,8 @@ static void _opt_list()
 	     opt.partition == NULL ? "default" : opt.partition);
 	info("job name       : `%s'", opt.job_name);
 	info("distribution   : %s", format_task_dist_states(opt.distribution));
+	info("cpu_bind       : %s", 
+	     opt.cpu_bind == NULL ? "default" : opt.cpu_bind);
 	info("core format    : %s", core_format_name (opt.core_type));
 	info("verbose        : %d", _verbose);
 	info("slurmd_debug   : %d", opt.slurmd_debug);
@@ -1692,12 +1808,13 @@ static void _usage(void)
 "            [--core=type] [-T threads] [-W sec] [--attach] [--join] \n"
 "            [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n"
 "            [--mpi=type] [--account=name] [--dependency=jobid]\n"
-"            [--kill-on-bad-exit] [--propagate[=rlimits] ]\n"
+"            [--kill-on-bad-exit] [--propagate[=rlimits] ] [--cpu_bind=...]\n"
 #ifdef HAVE_BG		/* Blue gene specific options */
 "            [--geometry=XxYxZ] [--conn-type=type] [--no-rotate]\n"
 #endif
 "            [--mail-type=type] [--mail-user=user][--nice[=value]]\n"
 "            [--prolog=fname] [--epilog=fname]\n"
+"            [--task-prolog=fname] [--task-epilog=fname]\n"
 "            [-w hosts...] [-x hosts...] executable [args...]\n");
 }
 
@@ -1749,6 +1866,8 @@ static void _help(void)
 "      --mpi=type              specifies version of MPI to use\n"
 "      --prolog=program        run \"program\" before launching job step\n"
 "      --epilog=program        run \"program\" after launching job step\n"
+"      --task-prolog=program   run \"program\" before launching task\n"
+"      --task-epilog=program   run \"program\" after launching task\n"
 "      --begin=time            defer job until HH:MM DD/MM/YY\n"
 "      --mail-type=type        notify on state change: BEGIN, END, FAIL or ALL\n"
 "      --mail-user=user        who to send email notification for job state changes\n"
@@ -1776,6 +1895,17 @@ static void _help(void)
 "      --exclusive             allocate nodes in exclusive mode when\n" 
 "                              cpu consumable resource is enabled\n"
 "\n"
+"Affinity/Multi-core options: (when the task/affinity plugin is enabled)\n" 
+"      --cpu_bind=             Bind tasks to CPUs\n" 
+"             q[uiet],           quietly bind before task runs (default)\n"
+"             v[erbose],         verbosely report binding before task runs\n"
+"             no[ne]             don't bind tasks to CPUs (default)\n"
+"             rank               bind by task rank\n"
+"             map_cpu:<list>     bind by mapping CPU IDs to tasks as specified\n"
+"                                where <list> is <cpuid1>,<cpuid2>,...<cpuidN>\n"
+"             mask_cpu:<list>    bind by setting CPU masks on tasks as specified\n"
+"                                where <list> is <mask1>,<mask2>,...<maskN>\n"
+"\n"
 #ifdef HAVE_AIX				/* AIX/Federation specific options */
   "AIX related options:\n"
   "  --network=type              communication protocol to be used\n"
diff --git a/src/srun/opt.h b/src/srun/opt.h
index 806f32c4b3a..97e87d1e4cb 100644
--- a/src/srun/opt.h
+++ b/src/srun/opt.h
@@ -95,6 +95,8 @@ typedef struct srun_options {
 	int  max_threads;	/* --threads, -T (threads in srun) */
 	int  min_nodes;		/* --nodes=n,       -N n	*/ 
 	int  max_nodes;		/* --nodes=x-n,       -N x-n	*/ 
+	cpu_bind_type_t cpu_bind_type; /* --cpu_bind=           */
+	char *cpu_bind;		/* binding map for map/mask_cpu */
 	bool nodes_set;		/* true if nodes explicitly set */
 	int  time_limit;	/* --time,   -t			*/
 	char *partition;	/* --partition=n,   -p n   	*/
-- 
GitLab