diff --git a/src/plugins/task/cgroup/task_cgroup_cpuset.c b/src/plugins/task/cgroup/task_cgroup_cpuset.c
index ad93004f68c48a243af0ad0703d9c97d4e484231..15f5956efc335b5d371fb51c9cf94076ff1e35ed 100644
--- a/src/plugins/task/cgroup/task_cgroup_cpuset.c
+++ b/src/plugins/task/cgroup/task_cgroup_cpuset.c
@@ -3,8 +3,8 @@
  *****************************************************************************
  *  Copyright (C) 2009 CEA/DAM/DIF
  *  Written by Matthieu Hautreux <matthieu.hautreux@cea.fr>
- *  Portions copyright (C) 2012 Bull
- *  Written by Martin Perry <martin.perry@bull.com>
+ *  Portions copyright (C) 2012,2015 Bull/Atos
+ *  Written by Martin Perry <martin.perry@atos.net>
  *
  *  This file is part of SLURM, a resource management program.
  *  For details, see <http://slurm.schedmd.com/>.
@@ -108,6 +108,9 @@ static inline int hwloc_bitmap_isequal(
 
 # endif
 
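+/* hwloc object hierarchy (socket > core > PU), used with
+ * hwloc_get_obj_below_array_by_type() to look up one specific thread */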
+static hwloc_obj_type_t obj_types[3] = {HWLOC_OBJ_SOCKET, HWLOC_OBJ_CORE,
+					HWLOC_OBJ_PU};
+
 static uint16_t bind_mode = CPU_BIND_NONE   | CPU_BIND_MASK   |
 			    CPU_BIND_RANK   | CPU_BIND_MAP    |
 			    CPU_BIND_LDMASK | CPU_BIND_LDRANK |
@@ -639,25 +642,49 @@ static int _task_cgroup_cpuset_dist_cyclic(
 	hwloc_bitmap_t cpuset)
 {
 	hwloc_obj_t obj;
-	uint32_t *obj_idx;
-	uint32_t i, j, sock_idx, sock_loop, ntskip, npdist;
+	uint32_t  s_ix;		/* socket index */
+	uint32_t *c_ixc;	/* core index by socket (current taskid) */
+	uint32_t *c_ixn;	/* core index by socket (next taskid) */
+	uint32_t *t_ix;		/* thread index by core by socket */
 	uint32_t npus, ncores, nsockets;
 	uint32_t taskid = job->envtp->localid;
 	int spec_thread_cnt = 0;
 	bitstr_t *spec_threads = NULL;
 
+	uint32_t obj_idxs[3], nthreads, cps,
+		 tpc, i, j, sock_loop, ntskip, npdist;
+	bool core_cyclic, core_fcyclic, sock_fcyclic, core_block;
+
+	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
+						       HWLOC_OBJ_SOCKET);
+	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
+						       HWLOC_OBJ_CORE);
+	nthreads = (uint32_t) hwloc_get_nbobjs_by_type(topology,
+						       HWLOC_OBJ_PU);
+	cps = ncores/nsockets;
+	tpc = nthreads/ncores;
+
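+	/* Decode the socket- and core-level parts of the -m/--distribution
+	 * option: *BLOCK fills an object before moving on, *CYCLIC places
+	 * consecutive tasks on consecutive objects, and *CFULL (fcyclic)
+	 * spreads the CPUs of a single task over consecutive objects. */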
+	sock_fcyclic = ((job->task_dist & SLURM_DIST_SOCKMASK) ==
+			SLURM_DIST_SOCKCFULL);
+	core_block = ((job->task_dist & SLURM_DIST_COREMASK) ==
+		      SLURM_DIST_COREBLOCK);
+	core_cyclic = ((job->task_dist & SLURM_DIST_COREMASK) ==
+		       SLURM_DIST_CORECYCLIC);
+	core_fcyclic = ((job->task_dist & SLURM_DIST_COREMASK) ==
+			SLURM_DIST_CORECFULL);
+
 	if (bind_verbose) {
 		info("task/cgroup: task[%u] using %s distribution "
-		     "(task_dist=%u)", taskid,
+		     "(task_dist=0x%x)", taskid,
 		     format_task_dist_states(job->task_dist), job->task_dist);
 	}
-	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
-						       HWLOC_OBJ_SOCKET);
-	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
-						     HWLOC_OBJ_CORE);
+
 	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
 						   HWLOC_OBJ_PU);
-	obj_idx = xmalloc(nsockets * sizeof(uint32_t));
+
+	t_ix = xmalloc(ncores * sizeof(uint32_t));
+	c_ixc = xmalloc(nsockets * sizeof(uint32_t));
+	c_ixn = xmalloc(nsockets * sizeof(uint32_t));
 
 	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
 		/* cores or threads granularity */
@@ -702,50 +729,83 @@ static int _task_cgroup_cpuset_dist_cyclic(
 	   current task cpuset. To prevent infinite loop, check
 	   that we do not loop more than npdist times around the available
 	   sockets, which is the worst scenario we should afford here. */
-	i = 0;
-	j = 0;
-	sock_idx = 0;
-	sock_loop = 0;
-	while (i < ntskip + 1 && sock_loop < npdist + 1) {
+	i = j = s_ix = sock_loop = 0;
+	while (i < ntskip + 1 && (sock_loop/tpc) < npdist + 1) {
 		/* fill one or multiple sockets using block mode, unless
 		   otherwise stated in the job->task_dist field */
-		while ((sock_idx < nsockets) && (j < npdist)) {
+		while ((s_ix < nsockets) && (j < npdist)) {
+			if (c_ixc[s_ix] == cps)
+				c_ixc[s_ix] = 0;
 			obj = hwloc_get_obj_below_by_type(
-				topology, HWLOC_OBJ_SOCKET, sock_idx,
-				hwtype, obj_idx[sock_idx]);
+				topology, HWLOC_OBJ_SOCKET, s_ix,
+				hwtype, c_ixc[s_ix]);
 			if (obj != NULL) {
-				obj_idx[sock_idx]++;
-				j++;
-				if (i == ntskip) {
-					_add_hwloc_cpuset(hwtype, req_hwtype,
-							  obj, taskid,
-							  bind_verbose, cpuset);
+				if (hwloc_compare_types(hwtype, HWLOC_OBJ_PU)
+									>= 0) {
+					/* granularity is thread */
+					obj_idxs[0]=s_ix;
+					obj_idxs[1]=c_ixc[s_ix];
+					obj_idxs[2]=t_ix[(s_ix*cps)+c_ixc[s_ix]];
+					obj = hwloc_get_obj_below_array_by_type(
+						topology, 3, obj_types, obj_idxs);
+					if (obj != NULL) {
+						t_ix[(s_ix*cps)+c_ixc[s_ix]]++;
+						j++;
+						if (i == ntskip)
+							_add_hwloc_cpuset(hwtype,
+							req_hwtype, obj, taskid,
+							bind_verbose, cpuset);
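+						/* cyclic: stay on this core,
+						 * next task starts on the
+						 * next one; fcyclic: take
+						 * each CPU from a new core */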
+						if (j < npdist) {
+							if (core_cyclic) {
+								c_ixn[s_ix] =
+								c_ixc[s_ix] + 1;
+							} else if (core_fcyclic){
+								c_ixc[s_ix]++;
+								c_ixn[s_ix] =
+								c_ixc[s_ix];
+							}
+							if (sock_fcyclic)
+								s_ix++;
+						}
+					} else {
+						c_ixc[s_ix]++;
+						if (c_ixc[s_ix] == cps)
+							s_ix++;
+					}
+				} else {
+					/* granularity is core or larger */
+					c_ixc[s_ix]++;
+					j++;
+					if (i == ntskip)
+						_add_hwloc_cpuset(hwtype,
+							req_hwtype, obj, taskid,
+						bind_verbose, cpuset);
+					if ((j < npdist) && (sock_fcyclic))
+						s_ix++;
 				}
-				if ((j < npdist) &&
-				    (((job->task_dist & SLURM_DIST_STATE_BASE) ==
-				      SLURM_DIST_CYCLIC_CFULL) ||
-				     ((job->task_dist & SLURM_DIST_STATE_BASE) ==
-				      SLURM_DIST_BLOCK_CFULL)))
-					sock_idx++;
-			} else {
-				sock_idx++;
-			}
+			} else
+				s_ix++;
 		}
-		/* if it succeed, switch to the next task, starting
-		   with the next available socket, otherwise, loop back
-		   from the first socket trying to find available slots. */
+		/* if the task got its full allotment, switch to the next
+		 * task, starting with the next available socket; otherwise
+		 * loop back from the first socket looking for free slots. */
 		if (j == npdist) {
 			i++;
 			j = 0;
-			sock_idx++; // no validity check, handled by the while
+			if (!core_block)
+				c_ixn[s_ix] = c_ixc[s_ix] + 1;
+			memcpy(c_ixc, c_ixn, nsockets * sizeof(uint32_t));
+			s_ix++; // no validity check, handled by the while
 			sock_loop = 0;
 		} else {
 			sock_loop++;
-			sock_idx = 0;
+			s_ix = 0;
 		}
 	}
+	xfree(t_ix);
+	xfree(c_ixc);
+	xfree(c_ixn);
 
-	xfree(obj_idx);
 	if (spec_threads) {
 		for (i = 0; i < npus; i++) {
 			if (bit_test(spec_threads, i)) {
@@ -755,7 +815,7 @@ static int _task_cgroup_cpuset_dist_cyclic(
 		FREE_NULL_BITMAP(spec_threads);
 	}
 
-	/* should never happened in normal scenario */
+	/* should never happen in a normal scenario */
 	if (sock_loop > npdist) {
 		error("task/cgroup: task[%u] infinite loop broken while trying "
 		      "to provision compute elements using %s", taskid,
@@ -771,17 +831,18 @@ static int _task_cgroup_cpuset_dist_block(
 	stepd_step_rec_t *job, int bind_verbose, hwloc_bitmap_t cpuset)
 {
 	hwloc_obj_t obj;
-	uint32_t i, pfirst, plast;
+	uint32_t core_loop, ntskip, npdist;
+	uint32_t i, j, pfirst, plast;
 	uint32_t taskid = job->envtp->localid;
 	int hwdepth;
 	uint32_t npus, ncores, nsockets;
 	int spec_thread_cnt = 0;
 	bitstr_t *spec_threads = NULL;
 
-	if (bind_verbose) {
-		info("task/cgroup: task[%u] using block distribution, "
-		     "task_dist %u", taskid, job->task_dist);
-	}
+	uint32_t *thread_idx;
+	uint32_t core_idx;
+	bool core_fcyclic, core_block;
+
 	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
 						       HWLOC_OBJ_SOCKET);
 	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
@@ -789,6 +850,65 @@ static int _task_cgroup_cpuset_dist_block(
 	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
 						   HWLOC_OBJ_PU);
 
+	core_block = ((job->task_dist & SLURM_DIST_COREMASK) ==
+		      SLURM_DIST_COREBLOCK);
+	core_fcyclic = ((job->task_dist & SLURM_DIST_COREMASK) ==
+		       SLURM_DIST_CORECFULL);
+
+	if (bind_verbose) {
+		info("task/cgroup: task[%u] using block distribution, "
+		     "task_dist 0x%x", taskid, job->task_dist);
+	}
+
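+	/* Thread-level binding with a cyclic or fcyclic core distribution:
+	 * PUs are handed out core by core; fcyclic takes every CPU from the
+	 * next core, cyclic starts each new task on the next core. */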
+	if ((hwloc_compare_types(hwtype, HWLOC_OBJ_PU) == 0) && !core_block) {
+		thread_idx = xmalloc(ncores * sizeof(uint32_t));
+		ntskip = taskid;
+		npdist = job->cpus_per_task;
+
+		i = 0; j = 0;
+		core_idx = 0;
+		core_loop = 0;
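+		/* walk over the local task ids up to our own so that PUs
+		 * consumed by lower-numbered tasks are skipped; core_loop
+		 * guards against spinning when no free PU can be found */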
+		while (i < ntskip + 1 && core_loop < npdist + 1) {
+			while ((core_idx < ncores) && (j < npdist)) {
+				obj = hwloc_get_obj_below_by_type(
+					topology, HWLOC_OBJ_CORE, core_idx,
+					hwtype, thread_idx[core_idx]);
+				if (obj != NULL) {
+					thread_idx[core_idx]++;
+					j++;
+					if (i == ntskip)
+						_add_hwloc_cpuset(hwtype,
+							req_hwtype, obj, taskid,
+							bind_verbose, cpuset);
+					if ((j < npdist) && core_fcyclic)
+						core_idx++;
+				} else {
+					core_idx++;
+				}
+			}
+			if (j == npdist) {
+				i++; j = 0;
+				core_idx++; // no validity check, handled by the while
+				core_loop = 0;
+			} else {
+				core_loop++;
+				core_idx = 0;
+			}
+		}
+		xfree(thread_idx);
+
+		/* should never happen in a normal scenario */
+		if (core_loop > npdist) {
+			error("task/cgroup: task[%u] infinite loop broken while "
+			      "trying to provision compute elements using %s",
+			      taskid, format_task_dist_states(job->task_dist));
+			return XCGROUP_ERROR;
+		} else
+			return XCGROUP_SUCCESS;
+	}
+
 	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
 		/* cores or threads granularity */
 		pfirst = taskid * job->cpus_per_task ;
@@ -1244,6 +1364,12 @@ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
 	uint32_t taskid = job->envtp->localid;
 	uint32_t jntasks = job->node_tasks;
 	uint32_t jnpus;
+
+	/* Allocate and initialize hwloc objects */
+	hwloc_topology_init(&topology);
+	hwloc_topology_load(topology);
+	cpuset = hwloc_bitmap_alloc();
+
 	int spec_threads = 0;
 
 	if (job->batch) {
@@ -1257,11 +1383,6 @@ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
 	    (bind_type & CPU_BIND_VERBOSE))
 		bind_verbose = 1 ;
 
-	/* Allocate and initialize hwloc objects */
-	hwloc_topology_init(&topology);
-	hwloc_topology_load(topology);
-	cpuset = hwloc_bitmap_alloc();
-
 	if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
 	     hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) {
 		/* One socket contains multiple NUMA-nodes
@@ -1412,6 +1533,7 @@ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
 				error("task/cgroup: task[%u] unable to set "
 				      "mask 0x%s", taskid,
 				      cpuset_to_str(&ts, mstr));
+				error("sched_setaffinity rc = %d", rc);
 				fstatus = SLURM_ERROR;
 			} else if (bind_verbose) {
 				info("task/cgroup: task[%u] mask 0x%s",
@@ -1431,48 +1553,20 @@ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
 			     job->task_dist);
 		}
 
-		/* There are two "distributions,"  controlled by the
-		 * -m option of srun and friends. The first is the
-		 * distribution of tasks to nodes.  The second is the
-		 * distribution of allocated cpus to tasks for
-		 * binding.  This code is handling the second
-		 * distribution.  Here's how the values get set, based
-		 * on the value of -m
-		 *
-		 * SLURM_DIST_CYCLIC = srun -m cyclic
-		 * SLURM_DIST_BLOCK = srun -m block
-		 * SLURM_DIST_CYCLIC_CYCLIC = srun -m cyclic:cyclic
-		 * SLURM_DIST_BLOCK_CYCLIC = srun -m block:cyclic
-		 *
-		 * In the first two cases, the user only specified the
-		 * first distribution.  The second distribution
-		 * defaults to cyclic.  In the second two cases, the
-		 * user explicitly requested a second distribution of
-		 * cyclic.  So all these four cases correspond to a
-		 * second distribution of cyclic.   So we want to call
-		 * _task_cgroup_cpuset_dist_cyclic.
-		 *
-		 * If the user explicitly specifies a second
-		 * distribution of block, or if
-		 * CR_CORE_DEFAULT_DIST_BLOCK is configured and the
-		 * user does not explicitly specify a second
-		 * distribution of cyclic, the second distribution is
-		 * block, and we need to call
-		 * _task_cgroup_cpuset_dist_block. In these cases,
-		 * task_dist would be set to SLURM_DIST_CYCLIC_BLOCK
-		 * or SLURM_DIST_BLOCK_BLOCK.
+		/* See the srun man page for detailed information on the
+		 * --distribution option.
 		 *
 		 * You can see the equivalent code for the
 		 * task/affinity plugin in
 		 * src/plugins/task/affinity/dist_tasks.c, around line 368
 		 */
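+		/* Only the node- and socket-level distribution bits pick
+		 * between block and cyclic binding here; the core-level
+		 * bits are handled inside the dist functions themselves. */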
-		switch (job->task_dist & SLURM_DIST_STATE_BASE) {
+		switch (job->task_dist & SLURM_DIST_NODESOCKMASK) {
 		case SLURM_DIST_BLOCK_BLOCK:
 		case SLURM_DIST_CYCLIC_BLOCK:
 		case SLURM_DIST_PLANE:
 			/* tasks are distributed in blocks within a plane */
-			_task_cgroup_cpuset_dist_block(
-				topology, hwtype, req_hwtype,
+			_task_cgroup_cpuset_dist_block(topology,
+				hwtype, req_hwtype,
 				nobj, job, bind_verbose, cpuset);
 			break;
 		case SLURM_DIST_ARBITRARY:
@@ -1481,8 +1575,8 @@ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
 		case SLURM_DIST_UNKNOWN:
 			if (slurm_get_select_type_param()
 			    & CR_CORE_DEFAULT_DIST_BLOCK) {
-				_task_cgroup_cpuset_dist_block(
-					topology, hwtype, req_hwtype,
+				_task_cgroup_cpuset_dist_block(topology,
+					hwtype, req_hwtype,
 					nobj, job, bind_verbose, cpuset);
 				break;
 			}
@@ -1490,8 +1584,8 @@ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
 			   default dist block.
 			*/
 		default:
-			_task_cgroup_cpuset_dist_cyclic(
-				topology, hwtype, req_hwtype,
+			_task_cgroup_cpuset_dist_cyclic(topology,
+				hwtype, req_hwtype,
 				job, bind_verbose, cpuset);
 			break;
 		}