diff --git a/doc/man/man5/cgroup.conf.5 b/doc/man/man5/cgroup.conf.5
index 92e96dc04be28dba7493a1848bcad7b5cad61a0a..5b647cfc7f98183741555b4026529783f45e9e45 100644
--- a/doc/man/man5/cgroup.conf.5
+++ b/doc/man/man5/cgroup.conf.5
@@ -25,6 +25,12 @@ one is a proctrack plugin, the second one a task plugin.
 The following cgroup.conf parameters are defined to control the general behavior
 of Slurm cgroup plugins.
 
+.TP
+\fBCgroupMountpoint\fR=\fIPATH\fR
+Specify the \fIPATH\fR under which cgroups should be mounted. This
+should be a writeable directory which will contain cgroups mounted
+one per subsystem. The default \fIPATH\fR is /cgroup.
+
 .TP
 \fBCgroupAutomount\fR=<yes|no>
 Slurm cgroup plugins require valid and functional cgroup subsystem to be mounted
@@ -105,17 +111,19 @@ would be added :
 .TP
 \fBAllowedRAMSpace\fR=<number>
 Constrain the job cgroup RAM to this percentage of the allocated memory.
-The default value is 100.
-If the limit is exceeded, the job steps will be killed and a warning message
-will be written to standard error.
-Also see \fBConstrainRAMSpace\fR.
+The default value is 100. If SLURM is not allocating memory to jobs,
+the percentage is ignored and \fBMaxRAMPercent\fR is used instead.
+The percentage supplied may be expressed as a floating point
+number, e.g. 98.5. If the \fBAllowedRAMSpace\fR limit is exceeded, the
+job steps will be killed and a warning message will be written to standard
+error.  Also see \fBConstrainRAMSpace\fR.
 
 .TP
 \fBAllowedSwapSpace\fR=<number>
-Constrain the job cgroup swap space to this percentage of the allocated memory.
-The default value is 0.
-If the limit is exceeded, the job steps will be killed and a warning message
-will be written to standard error.
+Constrain the job cgroup swap space to this percentage of the allocated
+memory.  The default value is 0, which means that RAM+Swap will be limited
+to \fBAllowedRAMSpace\fR. The supplied percentage may be expressed as a
+floating point number, e.g. 50.5.  If the limit is exceeded, the job steps
+will be killed and a warning message will be written to standard error.
 Also see \fBConstrainSwapSpace\fR.
 
 .TP
@@ -130,6 +138,28 @@ If configured to "yes" then constrain the job's swap space usage.
 The default value is "no".
 Also see \fBAllowedSwapSpace\fR.
 
+.TP
+\fBMaxRAMPercent\fR=\fIPERCENT\fR
+Set an upper bound in percent of total RAM on the RAM constraint for a job.
+This will be the memory constraint applied to jobs that are not explicitly
+allocated memory by SLURM. The \fIPERCENT\fR may be an arbitrary floating
+point number. The default value is 100.
+
+.TP
+\fBMaxSwapPercent\fR=\fIPERCENT\fR
+Set an upper bound (in percent of total RAM) on the amount of RAM+Swap
+that may be used for a job. This will be the swap limit applied to jobs
+on systems where memory is not being explicitly allocated to jobs. The
+\fIPERCENT\fR may be an arbitrary floating point number between 0 and 100.
+The default value is 100.
+
+.TP
+\fBMinRAMSpace\fR=<number>
+Set a lower bound (in MB) on the memory limits defined by
+\fBAllowedRAMSpace\fR and \fBAllowedSwapSpace\fR. This prevents
+accidentally creating a memory cgroup with such a low limit that slurmstepd
+is immediately killed due to lack of RAM. The default limit is 30M.
+
 .TP
 \fBConstrainDevices\fR=<yes|no>
 If configured to "yes" then constrain the job's allowed devices based on GRES
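
For a concrete sense of how the parameters above interact, the following standalone sketch mirrors the clamping performed by the mem_limit_in_bytes() and swap_limit_in_bytes() helpers added to task_cgroup_memory.c later in this patch. It is an illustration only; the node size, job allocation and parameter values are all hypothetical.

    #include <stdint.h>
    #include <stdio.h>

    /* MB * percent -> bytes, as in percent_in_bytes() in task_cgroup_memory.c */
    static uint64_t percent_in_bytes(uint64_t mb, float percent)
    {
        return (uint64_t)((mb * 1024 * 1024) * (percent / 100.0));
    }

    int main(void)
    {
        uint64_t totalram = 24000;     /* node RealMemory in MB (hypothetical) */
        uint64_t job_mem  = 2000;      /* job allocation in MB (hypothetical)  */
        float allowed_ram  = 100.0;    /* AllowedRAMSpace  */
        float allowed_swap = 20.0;     /* AllowedSwapSpace */
        float max_ram_pct  = 95.0;     /* MaxRAMPercent    */
        float max_swap_pct = 10.0;     /* MaxSwapPercent   */
        uint64_t min_ram = 30ULL * 1024 * 1024;    /* MinRAMSpace (30MB) */

        uint64_t max_ram  = percent_in_bytes(totalram, max_ram_pct);
        uint64_t max_swap = max_ram + percent_in_bytes(totalram, max_swap_pct);

        /* RAM limit: AllowedRAMSpace% of the allocation, or total RAM
         * (capped by MaxRAMPercent) when no memory was allocated */
        uint64_t mlb = job_mem ? percent_in_bytes(job_mem, allowed_ram)
                               : totalram * 1024 * 1024;
        if (mlb < min_ram) mlb = min_ram;
        if (mlb > max_ram) mlb = max_ram;

        /* RAM+swap limit: AllowedSwapSpace% added on top of the RAM limit */
        uint64_t mls = mlb + percent_in_bytes(job_mem ? job_mem : totalram,
                                              allowed_swap);
        if (mls < min_ram)  mls = min_ram;
        if (mls > max_swap) mls = max_swap;

        printf("mem.limit=%lluMB memsw.limit=%lluMB\n",
               (unsigned long long)(mlb / (1024 * 1024)),
               (unsigned long long)(mls / (1024 * 1024)));
        return 0;
    }

With these figures the job cgroup would get mem.limit=2000MB and memsw.limit=2400MB.
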
diff --git a/etc/cgroup.release_common.example b/etc/cgroup.release_common.example
index 6c6c4d03a25cbd0c31f11910f705db2068eac7c0..f431d26855b51c00accca33218f9656829af02df 100644
--- a/etc/cgroup.release_common.example
+++ b/etc/cgroup.release_common.example
@@ -13,10 +13,20 @@
 # to ensure coherency of the cgroups contents.
 #
 
-base_path=/cgroup
 progname=$(basename $0)
 subsystem=${progname##*_}
-orphancg=${base_path}/${subsystem}/orphan
+
+get_mount_dir()
+{
+    local lssubsys=$(type -p lssubsys)
+    if [ -x "$lssubsys" ]; then
+        $lssubsys -m $subsystem | awk '{print $2}'
+    else
+        # fall back to /proc/mounts: find the hierarchy whose release_agent is this script
+        awk -v agent="release_agent=$0" '$0 ~ agent { print $2 }' /proc/mounts
+    fi
+}
+
+mountdir=$(get_mount_dir)
 
 if [[ $# -eq 0 ]]
 then
@@ -24,14 +34,31 @@ then
     exit 1
 fi
 
+# build orphan cg path
+if [[ $# -eq 1 ]]
+then
+    rmcg=${mountdir}$1
+else
+    rmcg=${mountdir}$2
+fi
+slurmcg=${rmcg%/uid_*}
+if [[ ${slurmcg} == ${rmcg} ]]
+then
+    # not a slurm job pattern, perhaps the slurmcg, just remove 
+    # the dir with a lock and exit
+    flock -x ${mountdir} -c "rmdir ${rmcg}"
+    exit $?
+fi
+orphancg=${slurmcg}/orphan
+
 # make sure orphan cgroup is existing
 if [[ ! -d ${orphancg} ]]
 then
     mkdir ${orphancg}
     case ${subsystem} in 
 	cpuset)
-	    cat ${base_path}/${subsystem}/cpuset.cpus > ${orphancg}/cpuset.cpus
-	    cat ${base_path}/${subsystem}/cpuset.mems > ${orphancg}/cpuset.mems
+	    cat ${mountdir}/cpuset.cpus > ${orphancg}/cpuset.cpus
+	    cat ${mountdir}/cpuset.mems > ${orphancg}/cpuset.mems
 	    ;;
 	*)
 	    ;;
@@ -42,7 +69,7 @@ fi
 if [[ $# -eq 1 ]]
 then
 
-    rmcg=${base_path}/${subsystem}$@
+    rmcg=${mountdir}$@
 
     # try to extract the uid cgroup from the input one
     # ( extract /uid_% from /uid%/job_*...)
@@ -51,13 +78,13 @@ then
     then
 	# not a slurm job pattern, perhaps the uidcg, just remove 
 	# the dir with a lock and exit
-	flock -x ${base_path}/${subsystem} -c "rmdir ${rmcg}"
+	flock -x ${mountdir} -c "rmdir ${rmcg}"
 	exit $?
     fi
 
-    if [[ -d ${base_path}/${subsystem} ]]
+    if [[ -d ${mountdir} ]]
     then
-	flock -x ${base_path}/${subsystem} -c "$0 sync $@"
+	flock -x ${mountdir} -c "$0 sync $@"
     fi
 
     exit $?
@@ -68,7 +95,7 @@ elif [[ $# -eq 2 ]] && [[ $1 == "sync" ]]
 then
 
     shift
-    rmcg=${base_path}/${subsystem}$@
+    rmcg=${mountdir}$@
     uidcg=${rmcg%/job_*}
 
     # remove this cgroup
diff --git a/slurm.spec b/slurm.spec
index fe85d4cb6c398fa86b25e6cc1a7be6a250c13204..1333c864fd305a2840271838ef7520e6f9f8ac31 100644
--- a/slurm.spec
+++ b/slurm.spec
@@ -589,10 +589,10 @@ rm -rf $RPM_BUILD_ROOT
 %config %{_sysconfdir}/slurm.conf.example
 %config %{_sysconfdir}/cgroup.conf.example
 %config %{_sysconfdir}/cgroup_allowed_devices_file.conf.example
-%config %{_sysconfdir}/cgroup.release_common.example
-%config (noreplace) %{_sysconfdir}/cgroup/release_freezer
-%config (noreplace) %{_sysconfdir}/cgroup/release_cpuset
-%config (noreplace) %{_sysconfdir}/cgroup/release_memory
+%config (replace) %{_sysconfdir}/cgroup.release_common.example
+%config (replace) %{_sysconfdir}/cgroup/release_freezer
+%config (replace) %{_sysconfdir}/cgroup/release_cpuset
+%config (replace) %{_sysconfdir}/cgroup/release_memory
 %config %{_sysconfdir}/slurm.epilog.clean
 %exclude %{_mandir}/man1/sjobexit*
 %if %{slurm_with blcr}
diff --git a/src/common/xcgroup.c b/src/common/xcgroup.c
index 8cc8b6376d2ddd9a364f3e23b5469314c86549c1..3da09bda6b29a7672209c9f421dbdffea7a3904c 100644
--- a/src/common/xcgroup.c
+++ b/src/common/xcgroup.c
@@ -93,10 +93,13 @@ int _file_write_content(char* file_path, char* content, size_t csize);
  *  - XCGROUP_ERROR
  *  - XCGROUP_SUCCESS
  */
-int xcgroup_ns_create(xcgroup_ns_t* cgns, char* mnt_point, char* mnt_args,
+int xcgroup_ns_create(slurm_cgroup_conf_t *conf,
+		xcgroup_ns_t* cgns, char* mnt_point, char* mnt_args,
 		      char* subsys, char* notify_prog) {
 
-	cgns->mnt_point = xstrdup(mnt_point);
+	cgns->mnt_point = xstrdup(conf->cgroup_mountpoint);
+	xstrcat(cgns->mnt_point, mnt_point);
+
 	cgns->mnt_args = xstrdup(mnt_args);
 	cgns->subsystems = xstrdup(subsys);
 	cgns->notify_prog = xstrdup(notify_prog);
@@ -822,7 +825,8 @@ int _file_write_uint64s(char* file_path, uint64_t* values, int nb)
 		if (rc < 1) {
 			debug2("unable to add value '%s' to file '%s' : %m",
 			       tstr, file_path);
-			fstatus = XCGROUP_ERROR;
+			if ( errno != ESRCH )
+				fstatus = XCGROUP_ERROR;
 		}
 
 	}
@@ -942,7 +946,8 @@ int _file_write_uint32s(char* file_path, uint32_t* values, int nb)
 		if (rc < 1) {
 			debug2("unable to add value '%s' to file '%s' : %m",
 			       tstr, file_path);
-			fstatus = XCGROUP_ERROR;
+			if ( errno != ESRCH )
+				fstatus = XCGROUP_ERROR;
 		}
 
 	}
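
The errno == ESRCH special case above covers tasks that exit between the time their pids are collected and the time they are written into a cgroup file such as "tasks". A standalone sketch of that pattern, using plain libc instead of the xcgroup helpers; the cgroup path is hypothetical:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Attach pids to a cgroup tasks file, ignoring pids that already exited */
    static int attach_pids(const char *tasks_path, const pid_t *pids, int n)
    {
        int rc = 0, i;
        int fd = open(tasks_path, O_WRONLY);
        if (fd < 0)
            return -1;
        for (i = 0; i < n; i++) {
            char buf[32];
            int len = snprintf(buf, sizeof(buf), "%d\n", (int)pids[i]);
            if (write(fd, buf, len) < 0 && errno != ESRCH) {
                /* ESRCH only means the task is already gone;
                 * anything else is a real error */
                fprintf(stderr, "pid %d: %s\n", (int)pids[i], strerror(errno));
                rc = -1;
            }
        }
        close(fd);
        return rc;
    }

    int main(void)
    {
        pid_t self = getpid();
        /* hypothetical hierarchy mounted under CgroupMountpoint=/cgroup */
        return attach_pids("/cgroup/freezer/slurm/tasks", &self, 1);
    }
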
diff --git a/src/common/xcgroup.h b/src/common/xcgroup.h
index cea81f9990980fafae971c94fce0cb02d491ab4f..7b83d278889eb0a80395227bad85c8571feacae1 100644
--- a/src/common/xcgroup.h
+++ b/src/common/xcgroup.h
@@ -43,14 +43,11 @@
 
 #include <sys/types.h>
 #include <dirent.h>
+#include "xcgroup_read_config.h"
 
 #define XCGROUP_ERROR    1
 #define XCGROUP_SUCCESS  0
 
-#ifndef CGROUP_BASEDIR
-#define CGROUP_BASEDIR "/cgroup"
-#endif
-
 typedef struct xcgroup_ns {
 
 	char* mnt_point;  /* mount point to use for the associated cgroup */
@@ -80,7 +77,8 @@ typedef struct xcgroup {
  *  - XCGROUP_ERROR
  *  - XCGROUP_SUCCESS
  */
-int xcgroup_ns_create(xcgroup_ns_t* cgns,
+int xcgroup_ns_create(slurm_cgroup_conf_t *conf,
+		      xcgroup_ns_t* cgns,
 		      char* mnt_point,char* mnt_args,
 		      char* subsys,char* notify_prog);
 
diff --git a/src/common/xcgroup_read_config.c b/src/common/xcgroup_read_config.c
index 6f71140845259e6eca7a963e47a129d6611e86f3..48fdf7bbbf5b2025db06711708fb8bb24525eb0c 100644
--- a/src/common/xcgroup_read_config.c
+++ b/src/common/xcgroup_read_config.c
@@ -54,6 +54,8 @@
 
 #include "xcgroup_read_config.h"
 
+#define DEFAULT_CGROUP_BASEDIR "/cgroup"
+
 slurm_cgroup_conf_t *slurm_cgroup_conf = NULL;
 
 /* Local functions */
@@ -73,14 +75,19 @@ static void _clear_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf)
 {
 	if (slurm_cgroup_conf) {
 		slurm_cgroup_conf->cgroup_automount = false ;
+		xfree(slurm_cgroup_conf->cgroup_mountpoint);
 		xfree(slurm_cgroup_conf->cgroup_subsystems);
 		xfree(slurm_cgroup_conf->cgroup_release_agent);
+		xfree(slurm_cgroup_conf->cgroup_prepend);
 		slurm_cgroup_conf->constrain_cores = false ;
 		slurm_cgroup_conf->task_affinity = false ;
 		slurm_cgroup_conf->constrain_ram_space = false ;
 		slurm_cgroup_conf->allowed_ram_space = 100 ;
+		slurm_cgroup_conf->max_ram_percent = 100 ;
+		slurm_cgroup_conf->min_ram_space = XCGROUP_DEFAULT_MIN_RAM;
 		slurm_cgroup_conf->constrain_swap_space = false ;
 		slurm_cgroup_conf->allowed_swap_space = 0 ;
+		slurm_cgroup_conf->max_swap_percent = 100 ;
 		slurm_cgroup_conf->memlimit_enforcement = 0 ;
 		slurm_cgroup_conf->memlimit_threshold = 100 ;
 		slurm_cgroup_conf->constrain_devices = false ;
@@ -88,6 +95,34 @@ static void _clear_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf)
 	}
 }
 
+/*
+ *   Parse a floating point value in s and return in val
+ *    Return -1 on error and leave *val unchanged.
+ */
+static int str_to_float (char *s, float *val)
+{
+	float f;
+	char *p;
+
+	errno = 0;
+	f = strtof (s, &p);
+
+	if ((*p != '\0') || (errno != 0))
+		return (-1);
+
+	*val = f;
+	return (0);
+}
+
+static void conf_get_float (s_p_hashtbl_t *t, char *name, float *fp)
+{
+	char *str;
+	if (!s_p_get_string(&str, name, t))
+		return;
+	if (str_to_float (str, fp) < 0)
+		fatal ("cgroup.conf: Invalid value '%s' for %s", str, name);
+	xfree (str);
+}
+
 /*
  * read_slurm_cgroup_conf - load the Slurm cgroup configuration from the
  *	cgroup.conf file.
@@ -97,17 +132,21 @@ extern int read_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf)
 {
 	s_p_options_t options[] = {
 		{"CgroupAutomount", S_P_BOOLEAN},
+		{"CgroupMountpoint", S_P_STRING},
 		{"CgroupSubsystems", S_P_STRING},
 		{"CgroupReleaseAgentDir", S_P_STRING},
 		{"ConstrainCores", S_P_BOOLEAN},
 		{"TaskAffinity", S_P_BOOLEAN},
 		{"ConstrainRAMSpace", S_P_BOOLEAN},
-		{"AllowedRAMSpace", S_P_UINT32},
+		{"AllowedRAMSpace", S_P_STRING},
+		{"MaxRAMPercent", S_P_STRING},
+		{"MinRAMSpace", S_P_UINT32},
 		{"ConstrainSwapSpace", S_P_BOOLEAN},
-		{"AllowedSwapSpace", S_P_UINT32},
+		{"AllowedSwapSpace", S_P_STRING},
+		{"MaxSwapPercent", S_P_STRING},
 		{"ConstrainCores", S_P_BOOLEAN},
 		{"MemoryLimitEnforcement", S_P_BOOLEAN},
-		{"MemoryLimitThreshold", S_P_UINT32},
+		{"MemoryLimitThreshold", S_P_STRING},
 		{"ConstrainDevices", S_P_BOOLEAN},
 		{"AllowedDevicesFile", S_P_STRING},
 		{NULL} };
@@ -137,8 +176,14 @@ extern int read_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf)
 
 		/* cgroup initialisation parameters */
 		if (!s_p_get_boolean(&slurm_cgroup_conf->cgroup_automount,
-				   "CgroupAutomount", tbl))
+			        "CgroupAutomount", tbl))
 			slurm_cgroup_conf->cgroup_automount = false;
+
+		if (!s_p_get_string(&slurm_cgroup_conf->cgroup_mountpoint,
+				"CgroupMountpoint", tbl))
+			slurm_cgroup_conf->cgroup_mountpoint =
+				xstrdup(DEFAULT_CGROUP_BASEDIR);
+
 		s_p_get_string(&slurm_cgroup_conf->cgroup_subsystems,
 			       "CgroupSubsystems", tbl);
 		s_p_get_string(&slurm_cgroup_conf->cgroup_release_agent,
@@ -147,6 +192,13 @@ extern int read_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf)
 			slurm_cgroup_conf->cgroup_release_agent =
 				xstrdup("/etc/slurm/cgroup");
 
+		/* cgroup prepend directory */
+#ifndef MULTIPLE_SLURMD
+		slurm_cgroup_conf->cgroup_prepend = xstrdup("/slurm");
+#else
+		slurm_cgroup_conf->cgroup_prepend = xstrdup("/slurm_%n");
+#endif
+
 		/* Cores constraints related conf items */
 		if (!s_p_get_boolean(&slurm_cgroup_conf->constrain_cores,
 				     "ConstrainCores", tbl))
@@ -159,23 +211,38 @@ extern int read_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf)
 		if (!s_p_get_boolean(&slurm_cgroup_conf->constrain_ram_space,
 				     "ConstrainRAMSpace", tbl))
 			slurm_cgroup_conf->constrain_ram_space = false;
-		if (!s_p_get_uint32(&slurm_cgroup_conf->allowed_ram_space,
-				    "AllowedRAMSpace", tbl))
-			slurm_cgroup_conf->allowed_ram_space = 100;
+
+		conf_get_float (tbl,
+				"AllowedRAMSpace",
+				&slurm_cgroup_conf->allowed_ram_space);
+
+		conf_get_float (tbl,
+				"MaxRAMPercent",
+				&slurm_cgroup_conf->max_ram_percent);
+
 		if (!s_p_get_boolean(&slurm_cgroup_conf->constrain_swap_space,
 				     "ConstrainSwapSpace", tbl))
 			slurm_cgroup_conf->constrain_swap_space = false;
-		if (!s_p_get_uint32(&slurm_cgroup_conf->allowed_swap_space,
-				    "AllowedSwapSpace", tbl))
-			slurm_cgroup_conf->allowed_swap_space = 0;
+
+		conf_get_float (tbl,
+				"AllowedSwapSpace",
+				&slurm_cgroup_conf->allowed_swap_space);
+
+		conf_get_float (tbl,
+				"MaxSwapPercent",
+				&slurm_cgroup_conf->max_swap_percent);
+
+		s_p_get_uint32 (&slurm_cgroup_conf->min_ram_space,
+		                "MinRAMSpace", tbl);
 
 		/* Memory limits */
 		if (!s_p_get_boolean(&slurm_cgroup_conf->memlimit_enforcement,
 				     "MemoryLimitEnforcement", tbl))
 			slurm_cgroup_conf->memlimit_enforcement = false;
-		if (!s_p_get_uint32(&slurm_cgroup_conf->memlimit_threshold,
-				    "MemoryLimitThreshold", tbl))
-			slurm_cgroup_conf->memlimit_threshold = 0;
+
+		conf_get_float (tbl,
+				"MemoryLimitThreshold",
+				&slurm_cgroup_conf->memlimit_threshold);
 
 		/* Devices constraint related conf items */
 		if (!s_p_get_boolean(&slurm_cgroup_conf->constrain_devices,
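
The percentage options are now read as strings and validated with strtof(), so a value such as "98.5" is accepted while trailing characters or out-of-range values trigger the fatal() in conf_get_float(). A standalone sketch of that validation; only str_to_float() mirrors the patch, the rest is illustrative:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Parse s as a float; reject trailing characters and range errors */
    static int str_to_float(const char *s, float *val)
    {
        char *end;
        float f;

        errno = 0;
        f = strtof(s, &end);
        if ((*end != '\0') || (errno != 0))
            return -1;
        *val = f;
        return 0;
    }

    int main(void)
    {
        float v = 0.0;
        printf("\"98.5\"  -> rc=%d val=%g\n", str_to_float("98.5", &v), v);
        printf("\"98.5%%\" -> rc=%d (rejected)\n", str_to_float("98.5%", &v));
        return 0;
    }
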
diff --git a/src/common/xcgroup_read_config.h b/src/common/xcgroup_read_config.h
index 7693b0e2f3bc3df018215eaae20332a10c7c95b0..a3d0738fa631c13e5cc202b508635ca4c57ba127 100644
--- a/src/common/xcgroup_read_config.h
+++ b/src/common/xcgroup_read_config.h
@@ -50,25 +50,37 @@
 #include <stdint.h>
 #endif  /* HAVE_CONFIG_H */
 
+/*  Default lower bound on memory limit in MB. This is required so we
+ *   don't immediately kill slurmstepd on mem cgroup creation if
+ *   an administrator or user sets an absurdly low mem limit.
+ */
+#define XCGROUP_DEFAULT_MIN_RAM 30
 
 /* Slurm cgroup plugins configuration parameters */
 typedef struct slurm_cgroup_conf {
 
 	bool      cgroup_automount;
+	char *    cgroup_mountpoint;
 	char *    cgroup_subsystems;
 	char *    cgroup_release_agent;
 
+	char *    cgroup_prepend;
+
 	bool      constrain_cores;
 	bool      task_affinity;
 
 	bool      constrain_ram_space;
-	uint32_t  allowed_ram_space;
+	float     allowed_ram_space;
+	float     max_ram_percent;       /* Upper bound on memory as % of RAM*/
+
+	uint32_t  min_ram_space;         /* Lower bound on memory limit (MB) */
 
 	bool      constrain_swap_space;
-	uint32_t  allowed_swap_space;
+	float     allowed_swap_space;
+	float     max_swap_percent;      /* Upper bound on swap as % of RAM  */
 
 	bool      memlimit_enforcement;
-	uint32_t  memlimit_threshold;
+	float     memlimit_threshold;
 
 	bool      constrain_devices;
 	char *    allowed_devices_file;
diff --git a/src/plugins/proctrack/cgroup/proctrack_cgroup.c b/src/plugins/proctrack/cgroup/proctrack_cgroup.c
index b38211be6b28ebb4552839a805c2eaa7c3350233..5de335def98c37d505681e1c1a8d66a15235c999 100644
--- a/src/plugins/proctrack/cgroup/proctrack_cgroup.c
+++ b/src/plugins/proctrack/cgroup/proctrack_cgroup.c
@@ -54,6 +54,7 @@
 
 #include "src/common/xcgroup_read_config.h"
 #include "src/common/xcgroup.h"
+#include "src/common/xstring.h"
 #include "src/common/xcpuinfo.h"
 
 #include <sys/types.h>
@@ -129,7 +130,7 @@ int _slurm_cgroup_init(void)
 	}
 
 	/* initialize freezer cgroup namespace */
-	if (xcgroup_ns_create(&freezer_ns, CGROUP_BASEDIR "/freezer", "",
+	if (xcgroup_ns_create(&slurm_cgroup_conf, &freezer_ns, "/freezer", "",
 			       "freezer", release_agent_path)
 	     != XCGROUP_SUCCESS) {
 		error("unable to create freezer cgroup namespace");
@@ -158,15 +159,42 @@ int _slurm_cgroup_init(void)
 
 int _slurm_cgroup_create(slurmd_job_t *job, uint64_t id, uid_t uid, gid_t gid)
 {
+	/* we do it here as we do not have access to the conf structure */
+	/* in libslurm (src/common/xcgroup.c) */
+	xcgroup_t slurm_cg;
+	char* pre = (char*) xstrdup(slurm_cgroup_conf.cgroup_prepend);
+#ifdef MULTIPLE_SLURMD
+	if ( conf->node_name != NULL )
+		xstrsubstitute(pre,"%n", conf->node_name);
+	else {
+		xfree(pre);
+		pre = (char*) xstrdup("/slurm");
+	}
+#endif
+
+	/* create slurm cgroup in the freezer ns (it could already exist) */
+	if (xcgroup_create(&freezer_ns, &slurm_cg,pre,
+			   getuid(), getgid()) != XCGROUP_SUCCESS) {
+		return SLURM_ERROR;
+	}
+	if (xcgroup_instanciate(&slurm_cg) != XCGROUP_SUCCESS) {
+		xcgroup_destroy(&slurm_cg);
+		return SLURM_ERROR;
+	}
+	else
+		xcgroup_destroy(&slurm_cg);
+
 	/* build user cgroup relative path if not set (should not be) */
 	if (*user_cgroup_path == '\0') {
 		if (snprintf(user_cgroup_path, PATH_MAX,
-			      "/uid_%u", uid) >= PATH_MAX) {
+			     "%s/uid_%u", pre, uid) >= PATH_MAX) {
 			error("unable to build uid %u cgroup relative "
 			      "path : %m", uid);
+			xfree(pre);
 			return SLURM_ERROR;
 		}
 	}
+	xfree(pre);
 
 	/* build job cgroup relative path if no set (should not be) */
 	if (*job_cgroup_path == '\0') {
@@ -240,13 +268,19 @@ int _slurm_cgroup_create(slurmd_job_t *job, uint64_t id, uid_t uid, gid_t gid)
 		return SLURM_ERROR;
 	}
 
+	/* inhibit the release agent for the step cgroup so that slurmstepd
+	 * is still able to add new pids to the container when the job
+	 * ends (TaskEpilog,...) */
+	xcgroup_set_param(&step_freezer_cg,"notify_on_release","0");
+
 	return SLURM_SUCCESS;
 }
 
 int _slurm_cgroup_destroy(void)
 {
 	if (jobstep_cgroup_path[0] != '\0') {
-		xcgroup_delete(&step_freezer_cg);
+		if ( xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS )
+			return SLURM_ERROR;
 		xcgroup_destroy(&step_freezer_cg);
 	}
 
@@ -500,8 +534,7 @@ extern int slurm_container_plugin_signal (uint64_t id, int signal)
 
 extern int slurm_container_plugin_destroy (uint64_t id)
 {
-	_slurm_cgroup_destroy();
-	return SLURM_SUCCESS;
+	return _slurm_cgroup_destroy();
 }
 
 extern uint64_t slurm_container_plugin_find(pid_t pid)
@@ -529,6 +562,7 @@ extern int slurm_container_plugin_wait(uint64_t cont_id)
 	}
 
 	/* Spin until the container is successfully destroyed */
+	/* This indicates that all tasks have exited the container */
 	while (slurm_container_plugin_destroy(cont_id) != SLURM_SUCCESS) {
 		slurm_container_plugin_signal(cont_id, SIGKILL);
 		sleep(delay);
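
The new cgroup_prepend value roots every per-job hierarchy under a slurm-owned cgroup: "/slurm", or "/slurm_%n" with the node name substituted when slurmd is built with MULTIPLE_SLURMD. A standalone sketch of the resulting relative paths, using snprintf in place of SLURM's xstrsubstitute(); the node name and uid are hypothetical:

    #include <stdio.h>

    int main(void)
    {
        const char *node = "node12";    /* hypothetical NodeName */
        unsigned uid = 1000;            /* hypothetical job owner uid */
        int multiple_slurmd = 1;        /* pretend a MULTIPLE_SLURMD build */
        char pre[64];
        char user_cgroup_path[128];

        if (multiple_slurmd)
            snprintf(pre, sizeof(pre), "/slurm_%s", node);  /* "%n" -> node name */
        else
            snprintf(pre, sizeof(pre), "/slurm");

        /* the user cgroup now lives below the slurm root cg; job and
         * step cgroups are nested under it as before */
        snprintf(user_cgroup_path, sizeof(user_cgroup_path),
                 "%s/uid_%u", pre, uid);
        printf("%s\n", user_cgroup_path);   /* e.g. /slurm_node12/uid_1000 */
        return 0;
    }
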
diff --git a/src/plugins/task/cgroup/Makefile.am b/src/plugins/task/cgroup/Makefile.am
index f7cc3e07272bfc8970c028363b1ff9964ab5a903..1813b9a4f3e4b2548d7eee33ec6cce3e087bfd30 100644
--- a/src/plugins/task/cgroup/Makefile.am
+++ b/src/plugins/task/cgroup/Makefile.am
@@ -9,7 +9,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common
 pkglib_LTLIBRARIES = task_cgroup.la
 
 # cgroup task plugin.
-task_cgroup_la_SOURCES = 	task_cgroup.c \
+task_cgroup_la_SOURCES = 	task_cgroup.h task_cgroup.c \
 				task_cgroup_cpuset.h task_cgroup_cpuset.c \
 				task_cgroup_memory.h task_cgroup_memory.c \
 				task_cgroup_devices.h task_cgroup_devices.c
diff --git a/src/plugins/task/cgroup/task_cgroup.c b/src/plugins/task/cgroup/task_cgroup.c
index 0fbb3b90731a689a0510e772123b4202c32bd778..810927c09d27fbe2cf9a235ca6ab4223448461f0 100644
--- a/src/plugins/task/cgroup/task_cgroup.c
+++ b/src/plugins/task/cgroup/task_cgroup.c
@@ -45,10 +45,12 @@
 #include "slurm/slurm_errno.h"
 #include "src/common/slurm_xlator.h"
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
-
-#include "src/common/xcgroup_read_config.h"
+#include "src/slurmd/slurmd/slurmd.h"
 #include "src/common/xcgroup.h"
+#include "src/common/xstring.h"
+#include "src/common/xcgroup_read_config.h"
 
+#include "task_cgroup.h"
 #include "task_cgroup_cpuset.h"
 #include "task_cgroup_memory.h"
 #include "task_cgroup_devices.h"
@@ -278,3 +280,41 @@ extern int task_post_step (slurmd_job_t *job)
 	fini();
 	return SLURM_SUCCESS;
 }
+
+extern char* task_cgroup_create_slurm_cg (xcgroup_ns_t* ns) {
+
+	/* we do it here as we do not have access to the conf structure */
+	/* in libslurm (src/common/xcgroup.c) */
+	xcgroup_t slurm_cg;
+	char* pre = (char*) xstrdup(slurm_cgroup_conf.cgroup_prepend);
+#ifdef MULTIPLE_SLURMD
+	if ( conf->node_name != NULL )
+		xstrsubstitute(pre,"%n", conf->node_name);
+	else {
+		xfree(pre);
+		pre = (char*) xstrdup("/slurm");
+	}
+#endif
+
+	/* create slurm cgroup in the ns (it could already exist) */
+	if (xcgroup_create(ns,&slurm_cg,pre,
+			   getuid(), getgid()) != XCGROUP_SUCCESS) {
+		xfree(pre);
+		return pre;
+	}
+	if (xcgroup_instanciate(&slurm_cg) != XCGROUP_SUCCESS) {
+		error("unable to build slurm cgroup for ns %s: %m",
+		      ns->subsystems);
+		xcgroup_destroy(&slurm_cg);
+		xfree(pre);
+		return pre;
+	}
+	else {
+		debug3("slurm cgroup %s successfully created for ns %s: %m",
+		       pre,ns->subsystems);
+		xcgroup_destroy(&slurm_cg);
+	}
+
+exit:
+	return pre;
+}
diff --git a/src/plugins/task/cgroup/task_cgroup.h b/src/plugins/task/cgroup/task_cgroup.h
new file mode 100644
index 0000000000000000000000000000000000000000..a65d3a4f2edba0b55a54cb6f3afda0e5587be1e3
--- /dev/null
+++ b/src/plugins/task/cgroup/task_cgroup.h
@@ -0,0 +1,46 @@
+/*****************************************************************************\
+ *  task_cgroup.h - cgroup common primitives for task/cgroup
+ *****************************************************************************
+ *  Copyright (C) 2009 CEA/DAM/DIF
+ *  Written by Matthieu Hautreux <matthieu.hautreux@cea.fr>
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.schedmd.com/slurmdocs/>.
+ *  Please also read the included file: DISCLAIMER.
+ *
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  In addition, as a special exception, the copyright holders give permission
+ *  to link the code of portions of this program with the OpenSSL library under
+ *  certain conditions as described in each individual source file, and
+ *  distribute linked combinations including the two. You must obey the GNU
+ *  General Public License in all respects for all of the code used other than
+ *  OpenSSL. If you modify file(s) with this exception, you may extend this
+ *  exception to your version of the file(s), but you are not obligated to do
+ *  so. If you do not wish to do so, delete this exception statement from your
+ *  version.  If you delete this exception statement from all source files in
+ *  the program, then also delete it here.
+ *
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
+\*****************************************************************************/
+
+#if HAVE_CONFIG_H
+#   include "config.h"
+#endif
+
+#ifndef _TASK_CGROUP_H_
+#define _TASK_CGROUP_H_
+
+extern char* task_cgroup_create_slurm_cg (xcgroup_ns_t* ns);
+
+#endif
diff --git a/src/plugins/task/cgroup/task_cgroup_cpuset.c b/src/plugins/task/cgroup/task_cgroup_cpuset.c
index 59f46512d39feb5668af5d31cb80f8e62f6fa471..78df78aeaf75d37752aadc286e01fa1f8ec15e3c 100644
--- a/src/plugins/task/cgroup/task_cgroup_cpuset.c
+++ b/src/plugins/task/cgroup/task_cgroup_cpuset.c
@@ -52,6 +52,8 @@
 #include "src/common/xcgroup.h"
 #include "src/common/xcpuinfo.h"
 
+#include "task_cgroup.h"
+
 #ifdef HAVE_HWLOC
 #include <hwloc.h>
 #include <hwloc/glibc-sched.h>
@@ -94,7 +96,7 @@ extern int task_cgroup_cpuset_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
 		error("task/cgroup: unable to build cpuset release agent path");
 		goto error;
 	}
-	if (xcgroup_ns_create(&cpuset_ns,CGROUP_BASEDIR "/cpuset","",
+	if (xcgroup_ns_create(slurm_cgroup_conf, &cpuset_ns, "/cpuset", "",
 			       "cpuset",release_agent_path) !=
 	     XCGROUP_SUCCESS) {
 		error("task/cgroup: unable to create cpuset namespace");
@@ -165,15 +167,45 @@ extern int task_cgroup_cpuset_create(slurmd_job_t *job)
 	char* cpus = NULL;
 	size_t cpus_size;
 
+	char* slurm_cgpath ;
+	xcgroup_t slurm_cg;
+
+	/* create slurm root cg in this cg namespace */
+	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
+	if ( slurm_cgpath == NULL ) {
+		return SLURM_ERROR;
+	}
+
+	/* check that this cgroup has cpus allowed or initialize them */
+	if (xcgroup_load(&cpuset_ns,&slurm_cg,slurm_cgpath)
+	    != XCGROUP_SUCCESS) {
+		error("task/cgroup: unable to load slurm cpuset xcgroup");
+		xfree(slurm_cgpath);
+		return SLURM_ERROR;
+	}
+	rc = xcgroup_get_param(&slurm_cg,"cpuset.cpus",&cpus,&cpus_size);
+	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
+		/* initialize the cpusets as they were nonexistent */
+		if (_xcgroup_cpuset_init(&slurm_cg) !=
+		    XCGROUP_SUCCESS) {
+			xfree(slurm_cgpath);
+			xcgroup_destroy(&slurm_cg);
+			return SLURM_ERROR;
+		}
+	}
+	xfree(cpus);
+
 	/* build user cgroup relative path if not set (should not be) */
 	if (*user_cgroup_path == '\0') {
-		if (snprintf(user_cgroup_path,PATH_MAX,
-			      "/uid_%u",uid) >= PATH_MAX) {
-			error("task/cgroup: unable to build uid %u cpuset "
-			      "cg relative path : %m",uid);
+		if (snprintf(user_cgroup_path, PATH_MAX,
+			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
+			error("unable to build uid %u cgroup relative "
+			      "path : %m", uid);
+			xfree(slurm_cgpath);
 			return SLURM_ERROR;
 		}
 	}
+	xfree(slurm_cgpath);
 
 	/* build job cgroup relative path if no set (should not be) */
 	if (*job_cgroup_path == '\0') {
diff --git a/src/plugins/task/cgroup/task_cgroup_devices.c b/src/plugins/task/cgroup/task_cgroup_devices.c
index 9053bec0e91bb5eff7cbcc0e63250145cd6b5e06..6a39f87eacf138cb486ff84e5ec7ecf2e592bbaa 100644
--- a/src/plugins/task/cgroup/task_cgroup_devices.c
+++ b/src/plugins/task/cgroup/task_cgroup_devices.c
@@ -56,11 +56,12 @@
 #include "src/common/gres.h"
 #include "src/common/list.h"
 
+#include "task_cgroup.h"
+
 #ifndef PATH_MAX
 #define PATH_MAX 256
 #endif
 
-
 static char user_cgroup_path[PATH_MAX];
 static char job_cgroup_path[PATH_MAX];
 static char jobstep_cgroup_path[PATH_MAX];
@@ -102,7 +103,7 @@ extern int task_cgroup_devices_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
 		error("task/cgroup: unable to build devices release agent path");
 		goto error;
 	}
-	if ( xcgroup_ns_create(&devices_ns,CGROUP_BASEDIR "/devices","",
+	if (xcgroup_ns_create(slurm_cgroup_conf, &devices_ns, "/devices","",
 			       "devices",release_agent_path) != 
 	     XCGROUP_SUCCESS ) {
 		error("task/cgroup: unable to create devices namespace");
@@ -179,16 +180,25 @@ extern int task_cgroup_devices_create(slurmd_job_t *job)
 	List job_gres_list = job->job_gres_list;
 	List step_gres_list = job->step_gres_list;
 
-	
+	char* slurm_cgpath ;
+
+	/* create slurm root cg in this cg namespace */
+	slurm_cgpath = task_cgroup_create_slurm_cg(&devices_ns);
+	if ( slurm_cgpath == NULL ) {
+		return SLURM_ERROR;
+	}
+
 	/* build user cgroup relative path if not set (should not be) */
-	if ( *user_cgroup_path == '\0' ) {
-		if ( snprintf(user_cgroup_path,PATH_MAX,
-			      "/uid_%u", uid) >= PATH_MAX ) {
-		error("task/cgroup: unable to build uid %u devices "
-		      "cg relative path : %m", uid);
+	if (*user_cgroup_path == '\0') {
+		if (snprintf(user_cgroup_path, PATH_MAX,
+			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
+			error("unable to build uid %u cgroup relative "
+			      "path : %m", uid);
+			xfree(slurm_cgpath);
 			return SLURM_ERROR;
 		}
 	}
+	xfree(slurm_cgpath);
 
 	/* build job cgroup relative path if no set (should not be) */
 	if ( *job_cgroup_path == '\0' ) {
diff --git a/src/plugins/task/cgroup/task_cgroup_memory.c b/src/plugins/task/cgroup/task_cgroup_memory.c
index 70de4c76f2d651e6c0160ec2b34b36320d243e4f..a7e0b0de711ef9cf869de112ab8ece60f0ac77fc 100644
--- a/src/plugins/task/cgroup/task_cgroup_memory.c
+++ b/src/plugins/task/cgroup/task_cgroup_memory.c
@@ -50,10 +50,14 @@
 #include "src/common/xcgroup_read_config.h"
 #include "src/common/xcgroup.h"
 
+#include "task_cgroup.h"
+
 #ifndef PATH_MAX
 #define PATH_MAX 256
 #endif
 
+extern slurmd_conf_t *conf;
+
 static char user_cgroup_path[PATH_MAX];
 static char job_cgroup_path[PATH_MAX];
 static char jobstep_cgroup_path[PATH_MAX];
@@ -64,9 +68,18 @@ static xcgroup_t user_memory_cg;
 static xcgroup_t job_memory_cg;
 static xcgroup_t step_memory_cg;
 
-static int allowed_ram_space;
-static int allowed_swap_space;
+static float allowed_ram_space;   /* Allowed RAM in percent       */
+static float allowed_swap_space;  /* Allowed Swap percent         */
+
+static uint64_t max_ram;        /* Upper bound for memory.limit_in_bytes  */
+static uint64_t max_swap;       /* Upper bound for swap                   */
+static uint64_t totalram;       /* Total real memory available on node    */
+static uint64_t min_ram_space;  /* Don't constrain RAM below this value       */
 
+static uint64_t percent_in_bytes (uint64_t mb, float percent)
+{
+	return ((mb * 1024 * 1024) * (percent / 100.0));
+}
 
 extern int task_cgroup_memory_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
 {
@@ -84,7 +97,7 @@ extern int task_cgroup_memory_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
 		error("task/cgroup: unable to build memory release agent path");
 		goto error;
 	}
-	if (xcgroup_ns_create(&memory_ns,CGROUP_BASEDIR "/memory","",
+	if (xcgroup_ns_create(slurm_cgroup_conf, &memory_ns, "/memory", "",
 			       "memory",release_agent_path) !=
 	     XCGROUP_SUCCESS) {
 		error("task/cgroup: unable to create memory namespace");
@@ -110,6 +123,25 @@ extern int task_cgroup_memory_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
 	allowed_ram_space = slurm_cgroup_conf->allowed_ram_space;
 	allowed_swap_space = slurm_cgroup_conf->allowed_swap_space;
 
+	if ((totalram = (uint64_t) conf->real_memory_size) == 0)
+		error ("task/cgroup: Unable to get RealMemory size");
+
+	max_ram = percent_in_bytes(totalram, slurm_cgroup_conf->max_ram_percent);
+	max_swap = percent_in_bytes(totalram, slurm_cgroup_conf->max_swap_percent);
+	max_swap += max_ram;
+	min_ram_space = slurm_cgroup_conf->min_ram_space * 1024 * 1024;
+
+	debug ("task/cgroup/memory: total:%luM allowed:%.4g%%, swap:%.4g%%, "
+	      "max:%.4g%%(%luM) max+swap:%.4g%%(%luM) min:%uM",
+	      (unsigned long) totalram,
+	      allowed_ram_space,
+	      allowed_swap_space,
+	      slurm_cgroup_conf->max_ram_percent,
+	      (unsigned long) (max_ram/(1024*1024)),
+	      slurm_cgroup_conf->max_swap_percent,
+	      (unsigned long) (max_swap/(1024*1024)),
+	      (unsigned) slurm_cgroup_conf->min_ram_space);
+
         /*
          *  Warning: OOM Killer must be disabled for slurmstepd
          *  or it would be destroyed if the application use
@@ -171,6 +203,76 @@ extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
 	return SLURM_SUCCESS;
 }
 
+/*
+ *  Return configured memory limit in bytes given a memory limit in MB.
+ */
+static uint64_t mem_limit_in_bytes (uint64_t mem)
+{
+	/* 
+	 *  If mem == 0 then assume there was no SLURM limit imposed
+	 *   on the amount of memory for job or step. Use the total
+	 *   amount of available RAM instead.
+	 */
+	if (mem == 0)
+		mem = totalram * 1024 * 1024;
+	else
+		mem = percent_in_bytes (mem, allowed_ram_space);
+	if (mem < min_ram_space)
+		return (min_ram_space);
+	if (mem > max_ram)
+		return (max_ram);
+	return (mem);
+}
+
+/*
+ *  Return configured swap limit in bytes given a memory limit in MB.
+ *
+ *   Swap limit is calculated as:
+ *
+ *     mem_limit_in_bytes + (configured_swap_percent * allocated_mem_in_bytes)
+ */
+static uint64_t swap_limit_in_bytes (uint64_t mem)
+{
+	uint64_t swap;
+	/*
+	 *  If mem == 0 assume "unlimited" and use totalram.
+	 */
+	swap = percent_in_bytes (mem ? mem : totalram, allowed_swap_space);
+	mem = mem_limit_in_bytes (mem) + swap;
+	if (mem < min_ram_space)
+		return (min_ram_space);
+	if (mem > max_swap)
+		return (max_swap);
+	return (mem);
+}
+
+static int memcg_initialize (xcgroup_ns_t *ns, xcgroup_t *cg,
+		char *path, uint64_t mem_limit, uid_t uid, gid_t gid)
+{
+	uint64_t mlb = mem_limit_in_bytes (mem_limit);
+	uint64_t mls = swap_limit_in_bytes  (mem_limit);
+
+	if (xcgroup_create (ns, cg, path, uid, gid) != XCGROUP_SUCCESS)
+		return -1;
+
+	if (xcgroup_instanciate (cg) != XCGROUP_SUCCESS) {
+		xcgroup_destroy (cg);
+		return -1;
+	}
+
+	xcgroup_set_param (cg, "memory.use_hierarchy","1");
+	xcgroup_set_uint64_param (cg, "memory.limit_in_bytes", mlb);
+	xcgroup_set_uint64_param (cg, "memory.memsw.limit_in_bytes", mls);
+
+	info ("task/cgroup: %s: alloc=%luMB mem.limit=%luMB memsw.limit=%luMB",
+		path,
+		(unsigned long) mem_limit,
+		(unsigned long) mlb/(1024*1024),
+		(unsigned long) mls/(1024*1024));
+
+	return 0;
+}
+
 extern int task_cgroup_memory_create(slurmd_job_t *job)
 {
 	int rc;
@@ -181,19 +283,28 @@ extern int task_cgroup_memory_create(slurmd_job_t *job)
 	uint32_t jobid = job->jobid;
 	uint32_t stepid = job->stepid;
 	uid_t uid = job->uid;
-	uid_t gid = job->gid;
+	gid_t gid = job->gid;
 	pid_t pid;
-	uint64_t ml,mlb,mls;
+
+	char* slurm_cgpath ;
+
+	/* create slurm root cg in this cg namespace */
+	slurm_cgpath = task_cgroup_create_slurm_cg(&memory_ns);
+	if ( slurm_cgpath == NULL ) {
+		return SLURM_ERROR;
+	}
 
 	/* build user cgroup relative path if not set (should not be) */
 	if (*user_cgroup_path == '\0') {
-		if (snprintf(user_cgroup_path,PATH_MAX,
-			      "/uid_%u",uid) >= PATH_MAX) {
-			error("task/cgroup: unable to build uid %u memory "
-			      "cg relative path : %m",uid);
+		if (snprintf(user_cgroup_path, PATH_MAX,
+			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
+			error("unable to build uid %u cgroup relative "
+			      "path : %m", uid);
+			xfree(slurm_cgpath);
 			return SLURM_ERROR;
 		}
 	}
+	xfree(slurm_cgpath);
 
 	/* build job cgroup relative path if no set (should not be) */
 	if (*job_cgroup_path == '\0') {
@@ -265,58 +376,22 @@ extern int task_cgroup_memory_create(slurmd_job_t *job)
 	 * container in order to guarantee that a job will stay on track
 	 * regardless of the consumption of each step.
 	 */
-	ml = (uint64_t) job->job_mem;
-	ml = ml * 1024 * 1024 ;
-	mlb = (uint64_t) (ml * (allowed_ram_space / 100.0)) ;
-	mls = (uint64_t) mlb + (ml * (allowed_swap_space / 100.0)) ;
-	if (xcgroup_create(&memory_ns,&job_memory_cg,
-			    job_cgroup_path,
-			    getuid(),getgid()) != XCGROUP_SUCCESS) {
-		xcgroup_destroy(&user_memory_cg);
-		goto error;
-	}
-	if (xcgroup_instanciate(&job_memory_cg) != XCGROUP_SUCCESS) {
-		xcgroup_destroy(&user_memory_cg);
-		xcgroup_destroy(&job_memory_cg);
+	if (memcg_initialize (&memory_ns, &job_memory_cg, job_cgroup_path,
+	                      job->job_mem, getuid(), getgid()) < 0) {
+		xcgroup_destroy (&user_memory_cg);
 		goto error;
 	}
-	xcgroup_set_param(&job_memory_cg,"memory.use_hierarchy","1");
-	xcgroup_set_uint64_param(&job_memory_cg,
-				 "memory.limit_in_bytes",mlb);
-	xcgroup_set_uint64_param(&job_memory_cg,
-				 "memory.memsw.limit_in_bytes",mls);
-	debug("task/cgroup: job mem.limit=%"PRIu64"MB memsw.limit=%"PRIu64"MB",
-	      mlb/(1024*1024),mls/(1024*1024));
 
 	/*
 	 * Create step cgroup in the memory ns (it should not exists)
 	 * and set the associated memory limits.
 	 */
-	ml = (uint64_t) job->step_mem;
-	ml = ml * 1024 * 1024 ;
-	mlb = (uint64_t) (ml * (allowed_ram_space / 100.0)) ;
-	mls = (uint64_t) mlb + (ml * (allowed_swap_space / 100.0)) ;
-	if (xcgroup_create(&memory_ns,&step_memory_cg,
-			    jobstep_cgroup_path,
-			    uid,gid) != XCGROUP_SUCCESS) {
-		/* do not delete user/job cgroup as */
-		/* they can exist for other steps */
-		xcgroup_destroy(&user_memory_cg);
-		xcgroup_destroy(&job_memory_cg);
-		goto error;
-	}
-	if (xcgroup_instanciate(&step_memory_cg) != XCGROUP_SUCCESS) {
+	if (memcg_initialize (&memory_ns, &step_memory_cg, jobstep_cgroup_path,
+	                      job->step_mem, uid, gid) < 0) {
 		xcgroup_destroy(&user_memory_cg);
 		xcgroup_destroy(&job_memory_cg);
-		xcgroup_destroy(&step_memory_cg);
 		goto error;
 	}
-	xcgroup_set_uint64_param(&step_memory_cg,
-				 "memory.limit_in_bytes",mlb);
-	xcgroup_set_uint64_param(&step_memory_cg,
-				 "memory.memsw.limit_in_bytes",mls);
-	debug("task/cgroup: step mem.limit=%"PRIu64"MB memsw.limit=%"PRIu64"MB",
-	      mlb/(1024*1024),mls/(1024*1024));
 
 	/*
 	 * Attach the slurmstepd to the step memory cgroup
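
The limits computed by memcg_initialize() ultimately become plain writes into control files of the job and step memory cgroups. A minimal standalone sketch of that effect, bypassing the xcgroup API; the mount point, cgroup path and byte values are hypothetical:

    #include <errno.h>
    #include <stdio.h>
    #include <sys/stat.h>

    /* Write one value into a cgroup control file, e.g.
     * <cgdir>/memory.limit_in_bytes */
    static int write_param(const char *cgdir, const char *param,
                           unsigned long long value)
    {
        char path[4096];
        FILE *fp;

        snprintf(path, sizeof(path), "%s/%s", cgdir, param);
        if ((fp = fopen(path, "w")) == NULL)
            return -1;
        fprintf(fp, "%llu\n", value);
        return fclose(fp);
    }

    int main(void)
    {
        /* hypothetical job memory cgroup under CgroupMountpoint=/cgroup */
        const char *cgdir = "/cgroup/memory/slurm/uid_1000/job_42";
        unsigned long long mlb = 2000ULL << 20;   /* 2000MB RAM limit      */
        unsigned long long mls = 2400ULL << 20;   /* 2400MB RAM+swap limit */

        if (mkdir(cgdir, 0755) < 0 && errno != EEXIST)
            perror("mkdir");
        write_param(cgdir, "memory.use_hierarchy", 1);
        write_param(cgdir, "memory.limit_in_bytes", mlb);
        write_param(cgdir, "memory.memsw.limit_in_bytes", mls);
        return 0;
    }
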
diff --git a/src/slurmd/common/slurmstepd_init.c b/src/slurmd/common/slurmstepd_init.c
index 4aa8debc82459cf51f0ae620e61ad20624205df1..e33b7b28ca43439fe893eb4ca007b7c7bb2a5383 100644
--- a/src/slurmd/common/slurmstepd_init.c
+++ b/src/slurmd/common/slurmstepd_init.c
@@ -45,6 +45,7 @@ extern void pack_slurmd_conf_lite(slurmd_conf_t *conf, Buf buffer)
 	pack16(conf->sockets, buffer);
 	pack16(conf->cores, buffer);
 	pack16(conf->threads, buffer);
+	pack32(conf->real_memory_size, buffer);
 	packstr(conf->spooldir, buffer);
 	packstr(conf->node_name, buffer);
 	packstr(conf->logfile, buffer);
@@ -70,6 +71,7 @@ extern int unpack_slurmd_conf_lite_no_alloc(slurmd_conf_t *conf, Buf buffer)
 	safe_unpack16(&conf->sockets, buffer);
 	safe_unpack16(&conf->cores, buffer);
 	safe_unpack16(&conf->threads, buffer);
+	safe_unpack32(&conf->real_memory_size, buffer);
 	safe_unpackstr_xmalloc(&conf->spooldir,    &uint32_tmp, buffer);
 	safe_unpackstr_xmalloc(&conf->node_name,   &uint32_tmp, buffer);
 	safe_unpackstr_xmalloc(&conf->logfile,     &uint32_tmp, buffer);
diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c
index 9bb8271bfbb430248915d17454083a0b9d636089..55e9d549872036f9d5c13ae1449e007662da96a9 100644
--- a/src/slurmd/slurmstepd/mgr.c
+++ b/src/slurmd/slurmstepd/mgr.c
@@ -75,6 +75,13 @@
 #  include <stdlib.h>
 #endif
 
+#ifdef HAVE_PTY_H
+#  include <pty.h>
+#  ifdef HAVE_UTMP_H
+#    include <utmp.h>
+#  endif
+#endif
+
 #include "slurm/slurm_errno.h"
 
 #include "src/common/cbuf.h"
@@ -1088,6 +1095,110 @@ _spank_task_privileged(slurmd_job_t *job, int taskid, struct priv_state *sp)
 	return(_drop_privileges (job, true, sp));
 }
 
+struct exec_wait_info {
+	int id;
+	pid_t pid;
+	int parentfd;
+	int childfd;
+};
+
+static struct exec_wait_info * exec_wait_info_create (int i)
+{
+	int fdpair[2];
+	struct exec_wait_info * e;
+
+	if (pipe (fdpair) < 0) {
+		error ("exec_wait_info_create: pipe: %m");
+		return NULL;
+	}
+
+	fd_set_close_on_exec(fdpair[0]);
+	fd_set_close_on_exec(fdpair[1]);
+
+	e = xmalloc (sizeof (*e));
+	e->childfd = fdpair[0];
+	e->parentfd = fdpair[1];
+	e->id = i;
+	e->pid = -1;
+
+	return (e);
+}
+
+static void exec_wait_info_destroy (struct exec_wait_info *e)
+{
+	if (e == NULL)
+		return;
+
+	close (e->parentfd);
+	close (e->childfd);
+	e->id = -1;
+	e->pid = -1;
+	xfree (e);
+}
+
+static pid_t exec_wait_get_pid (struct exec_wait_info *e)
+{
+	if (e == NULL)
+		return (-1);
+	return (e->pid);
+}
+
+static struct exec_wait_info * fork_child_with_wait_info (int id)
+{
+	struct exec_wait_info *e;
+
+	if (!(e = exec_wait_info_create (id)))
+		return (NULL);
+
+	if ((e->pid = fork ()) < 0) {
+		exec_wait_info_destroy (e);
+		return (NULL);
+	}
+	else if (e->pid == 0)  /* In child, close parent fd */
+		close (e->parentfd);
+
+	return (e);
+}
+
+static int exec_wait_child_wait_for_parent (struct exec_wait_info *e)
+{
+	char c;
+
+	if (read (e->childfd, &c, sizeof (c)) != 1)
+		return error ("wait_for_parent: failed: %m");
+
+	return (0);
+}
+
+static int exec_wait_signal_child (struct exec_wait_info *e)
+{
+	char c = '\0';
+
+	if (write (e->parentfd, &c, sizeof (c)) != 1)
+		return error ("write to unblock task %d failed: %m", e->id);
+
+	return (0);
+}
+
+static int exec_wait_signal (struct exec_wait_info *e, slurmd_job_t *job)
+{
+	debug3 ("Unblocking %u.%u task %d, writefd = %d",
+	        job->jobid, job->stepid, e->id, e->parentfd);
+	exec_wait_signal_child (e);
+	return (0);
+}
+
+static void prepare_tty (slurmd_job_t *job, slurmd_task_info_t *task)
+{
+#ifdef HAVE_PTY_H
+	if (job->pty && (task->gtid == 0)) {
+		if (login_tty(task->stdin_fd))
+			error("login_tty: %m");
+		else
+			debug3("login_tty good");
+	}
+#endif
+	return;
+}
 
 /* fork and exec N tasks
  */
@@ -1096,12 +1207,10 @@ _fork_all_tasks(slurmd_job_t *job)
 {
 	int rc = SLURM_SUCCESS;
 	int i;
-	int *writefds; /* array of write file descriptors */
-	int *readfds; /* array of read file descriptors */
-	int fdpair[2];
 	struct priv_state sprivs;
 	jobacct_id_t jobacct_id;
 	char *oom_value;
+	List exec_wait_list = NULL;
 
 	xassert(job != NULL);
 
@@ -1118,36 +1227,6 @@ _fork_all_tasks(slurmd_job_t *job)
 	}
 	debug2("After call to spank_init()");
 
-	/*
-	 * Pre-allocate a pipe for each of the tasks
-	 */
-	debug3("num tasks on this node = %d", job->node_tasks);
-	writefds = (int *) xmalloc (job->node_tasks * sizeof(int));
-	if (!writefds) {
-		error("writefds xmalloc failed!");
-		return SLURM_ERROR;
-	}
-	readfds = (int *) xmalloc (job->node_tasks * sizeof(int));
-	if (!readfds) {
-		error("readfds xmalloc failed!");
-		return SLURM_ERROR;
-	}
-
-
-	for (i = 0; i < job->node_tasks; i++) {
-		fdpair[0] = -1; fdpair[1] = -1;
-		if (pipe (fdpair) < 0) {
-			error ("exec_all_tasks: pipe: %m");
-			return SLURM_ERROR;
-		}
-		debug3("New fdpair[0] = %d, fdpair[1] = %d",
-		       fdpair[0], fdpair[1]);
-		fd_set_close_on_exec(fdpair[0]);
-		fd_set_close_on_exec(fdpair[1]);
-		readfds[i] = fdpair[0];
-		writefds[i] = fdpair[1];
-	}
-
 	set_oom_adj(0);	/* the tasks may be killed by OOM */
 	if (pre_setuid(job)) {
 		error("Failed task affinity setup");
@@ -1185,27 +1264,33 @@ _fork_all_tasks(slurmd_job_t *job)
 		return SLURM_ERROR;
 	}
 
+	exec_wait_list = list_create ((ListDelF) exec_wait_info_destroy);
+	if (!exec_wait_list)
+		return error ("Unable to create exec_wait_list");
+
 	/*
 	 * Fork all of the task processes.
 	 */
 	for (i = 0; i < job->node_tasks; i++) {
 		char time_stamp[256];
 		pid_t pid;
-		if ((pid = fork ()) < 0) {
+		struct exec_wait_info *ei;
+
+		if ((ei = fork_child_with_wait_info (i)) == NULL) {
 			error("child fork: %m");
 			goto fail2;
-		} else if (pid == 0)  { /* child */
-			int j;
+		} else if ((pid = exec_wait_get_pid (ei)) == 0)  { /* child */
+			/*
+			 *  Destroy exec_wait_list in the child.
+			 *   Only exec_wait_info for previous tasks have been
+			 *   added to the list so far, so everything else
+			 *   can be discarded.
+			 */
+			list_destroy (exec_wait_list);
 
 #ifdef HAVE_AIX
 			(void) mkcrid(0);
 #endif
-			/* Close file descriptors not needed by the child */
-			for (j = 0; j < job->node_tasks; j++) {
-				close(writefds[j]);
-				if (j > i)
-					close(readfds[j]);
-			}
 			/* jobacct_gather_g_endpoll();
 			 * closing jobacct files here causes deadlock */
 
@@ -1229,14 +1314,28 @@ _fork_all_tasks(slurmd_job_t *job)
 
 			xsignal_unblock(slurmstepd_blocked_signals);
 
-			exec_task(job, i, readfds[i]);
+			/*
+			 *  Setup tty before any setpgid() calls
+			 */
+			prepare_tty (job, job->task[i]);
+
+			/*
+			 *  Block until parent notifies us that it is ok to
+			 *   proceed. This allows the parent to place all
+			 *   children in any process groups or containers
+			 *   before they make a call to exec(2).
+			 */
+			exec_wait_child_wait_for_parent (ei);
+
+			exec_task(job, i);
 		}
 
 		/*
 		 * Parent continues:
 		 */
 
-		close(readfds[i]);
+		list_append (exec_wait_list, ei);
+
 		LOG_TIMESTAMP(time_stamp);
 		verbose ("task %lu (%lu) started %s",
 			(unsigned long) job->task[i]->gtid,
@@ -1306,16 +1405,10 @@ _fork_all_tasks(slurmd_job_t *job)
 	/*
 	 * Now it's ok to unblock the tasks, so they may call exec.
 	 */
-	for (i = 0; i < job->node_tasks; i++) {
-		char c = '\0';
-
-		debug3("Unblocking %u.%u task %d, writefd = %d",
-		       job->jobid, job->stepid, i, writefds[i]);
-		if (write (writefds[i], &c, sizeof (c)) != 1)
-			error ("write to unblock task %d failed", i);
-
-		close(writefds[i]);
+	list_for_each (exec_wait_list, (ListForF) exec_wait_signal, job);
+	list_destroy (exec_wait_list);
 
+	for (i = 0; i < job->node_tasks; i++) {
 		/*
 		 * Prepare process for attach by parallel debugger
 		 * (if specified and able)
@@ -1324,17 +1417,14 @@ _fork_all_tasks(slurmd_job_t *job)
 				== SLURM_ERROR)
 			rc = SLURM_ERROR;
 	}
-	xfree(writefds);
-	xfree(readfds);
 
 	return rc;
 
 fail2:
 	_reclaim_privileges (&sprivs);
+	if (exec_wait_list)
+		list_destroy (exec_wait_list);
 fail1:
-	xfree(writefds);
-	xfree(readfds);
-
 	pam_finish();
 	return SLURM_ERROR;
 }
@@ -2124,6 +2214,7 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job,
 {
 	int status, rc, opt;
 	pid_t cpid;
+	struct exec_wait_info *ei;
 
 	xassert(env);
 	if (path == NULL || path[0] == '\0')
@@ -2140,11 +2231,11 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job,
 	    (slurm_container_create(job) != SLURM_SUCCESS))
 		error("slurm_container_create: %m");
 
-	if ((cpid = fork()) < 0) {
+	if ((ei = fork_child_with_wait_info(0)) == NULL) {
 		error ("executing %s: fork: %m", name);
 		return -1;
 	}
-	if (cpid == 0) {
+	if ((cpid = exec_wait_get_pid (ei)) == 0) {
 		struct priv_state sprivs;
 		char *argv[2];
 
@@ -2171,6 +2262,11 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job,
 #else
 		setpgrp();
 #endif
+		/*
+		 *  Wait for signal from parent
+		 */
+		exec_wait_child_wait_for_parent (ei);
+
 		execve(path, argv, env);
 		error("execve(): %m");
 		exit(127);
@@ -2178,6 +2274,11 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job,
 
 	if (slurm_container_add(job, cpid) != SLURM_SUCCESS)
 		error("slurm_container_add: %m");
+
+	if (exec_wait_signal_child (ei) < 0)
+		error ("run_script_as_user: Failed to wakeup %s", name);
+	exec_wait_info_destroy (ei);
+
 	if (max_wait < 0)
 		opt = 0;
 	else
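
The exec_wait_* helpers above replace the per-task readfds/writefds arrays with one pipe per task: the child blocks on its end of the pipe until the parent has placed it into the proper process group and job container, then the parent writes a single byte to release it into exec(2). A stripped-down standalone sketch of that handshake for one child (illustrative only):

    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        int fdpair[2];
        pid_t pid;
        char c = '\0';

        if (pipe(fdpair) < 0) {
            perror("pipe");
            return 1;
        }
        if ((pid = fork()) < 0) {
            perror("fork");
            return 1;
        }

        if (pid == 0) {                 /* child: wait for the go-ahead */
            close(fdpair[1]);
            if (read(fdpair[0], &c, sizeof(c)) != 1)
                _exit(1);
            /* ...this is where the task's exec(2) would happen... */
            _exit(0);
        }

        /* parent: add the child to process groups / containers here,
         * then unblock it */
        close(fdpair[0]);
        if (write(fdpair[1], &c, sizeof(c)) != 1)
            perror("write");
        close(fdpair[1]);

        waitpid(pid, NULL, 0);
        return 0;
    }
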
diff --git a/src/slurmd/slurmstepd/task.c b/src/slurmd/slurmstepd/task.c
index 4462596b35f0c3d46b613639d2b48d35ddd48c92..925e67e4ab0bc037bff252f05d5d268f8df3aaba 100644
--- a/src/slurmd/slurmstepd/task.c
+++ b/src/slurmd/slurmstepd/task.c
@@ -65,14 +65,6 @@
 #  include <sys/checkpnt.h>
 #endif
 
-#ifdef HAVE_PTY_H
-#  include <pty.h>
-#endif
-
-#ifdef HAVE_UTMP_H
-#  include <utmp.h>
-#endif
-
 #include <sys/resource.h>
 
 #include "slurm/slurm_errno.h"
@@ -337,37 +329,15 @@ _setup_mpi(slurmd_job_t *job, int ltaskid)
  *  Current process is running as the user when this is called.
  */
 void
-exec_task(slurmd_job_t *job, int i, int waitfd)
+exec_task(slurmd_job_t *job, int i)
 {
-	char c;
 	uint32_t *gtids;		/* pointer to arrary of ranks */
 	int fd, j;
-	int rc;
 	slurmd_task_info_t *task = job->task[i];
 
-#ifdef HAVE_PTY_H
-	/* Execute login_tty() before setpgid() calls */
-	if (job->pty && (task->gtid == 0)) {
-		if (login_tty(task->stdin_fd))
-			error("login_tty: %m");
-		else
-			debug3("login_tty good");
-	}
-#endif
-
 	if (i == 0)
 		_make_tmpdir(job);
 
-	/*
-	 * Stall exec until all tasks have joined the same process group
-	 */
-	if ((rc = read (waitfd, &c, sizeof (c))) != 1) {
-		error ("_exec_task read failed, fd = %d, rc=%d: %m", waitfd, rc);
-		log_fini();
-		exit(1);
-	}
-	close(waitfd);
-
 	gtids = xmalloc(job->node_tasks * sizeof(uint32_t));
 	for (j = 0; j < job->node_tasks; j++)
 		gtids[j] = job->task[j]->gtid;
@@ -422,14 +392,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
 		}
 	}
 
-#ifdef HAVE_PTY_H
-	if (job->pty && (task->gtid == 0)) {
-		/* Need to perform the login_tty() before all tasks
-		 * register and the process groups are reset, otherwise
-		 * login_tty() gets disabled */
-	} else
-#endif
-		io_dup_stdio(task);
+	io_dup_stdio(task);
 
 	/* task-specific pre-launch activities */
 
diff --git a/src/slurmd/slurmstepd/task.h b/src/slurmd/slurmstepd/task.h
index d067df52dca3f090966a1073dafae42748fc2421..78c0b6058cdc402411bff87e710adba65f9538ea 100644
--- a/src/slurmd/slurmstepd/task.h
+++ b/src/slurmd/slurmstepd/task.h
@@ -52,6 +52,6 @@
 
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
 
-void exec_task(slurmd_job_t *job, int i, int waitfd);
+void exec_task(slurmd_job_t *job, int i);
 
 #endif /* !_TASK_H */