From acccbdfe6bb38d9826f311de226921809d59967e Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Tue, 8 Nov 2005 18:34:46 +0000
Subject: [PATCH] Add hostfile support for POE and srun
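
Job steps can now be laid out from an explicit host list.  The list is
read from the file named by the MP_HOSTFILE environment variable (one
hostname per line, '#' starts a comment, and only as many lines as there
are tasks are read); tasks are assigned to the listed hosts in order,
one task per line.  srun also accepts --distribution=hostfile
(-m hostfile) to request this layout explicitly.  Illustrative example
(the path and node names are placeholders):

    $ cat /tmp/hosts
    linux1
    linux2
    linux1
    $ MP_HOSTFILE=/tmp/hosts srun -N2 -n3 hostname

With the file above, tasks 0 and 2 run on linux1 and task 1 runs on
linux2.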

---
 NEWS                                       |   2 +
 slurm/slurm.h.in                           |  11 +-
 src/api/allocate.c                         | 112 +++++-
 src/api/spawn.c                            | 259 +++++++++----
 src/common/dist_tasks.c                    |   3 -
 src/common/dist_tasks.h                    |  10 +-
 src/plugins/switch/federation/federation.c |   5 +-
 src/slurmctld/node_scheduler.c             |  30 +-
 src/slurmctld/proc_req.c                   |   6 +-
 src/slurmctld/step_mgr.c                   |  22 +-
 src/srun/allocate.c                        | 148 ++++----
 src/srun/allocate.h                        |   2 +-
 src/srun/launch.c                          |  44 ++-
 src/srun/msg.c                             |  24 +-
 src/srun/opt.c                             |  24 +-
 src/srun/opt.h                             |   7 +-
 src/srun/reattach.c                        |   8 +-
 src/srun/srun.c                            |  37 +-
 src/srun/srun_job.c                        | 416 +++++++++++++--------
 src/srun/srun_job.h                        |  28 +-
 testsuite/expect/test1.47                  |   6 +-
 testsuite/expect/test1.51                  |   2 +-
 testsuite/expect/test1.81                  |  31 +-
 testsuite/expect/test9.8                   |   6 +
 24 files changed, 839 insertions(+), 404 deletions(-)
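
Reviewer note: below is a minimal sketch (not part of the patch) of how
an API client can drive the new step context keys; it mirrors
build_step_ctx() in src/srun/srun_job.c, with error handling trimmed and
the step request assumed to be filled in by the caller.

#include <stdio.h>
#include <slurm/slurm.h>

/* Sketch only: "req" is assumed to already carry job_id, num_tasks,
 * node_list and task_dist = SLURM_DIST_HOSTFILE, as build_step_ctx()
 * sets them up. */
static void print_task_layout(job_step_create_request_msg_t *req)
{
	slurm_step_ctx ctx = slurm_step_ctx_create(req);
	uint32_t stepid, nhosts, *tasks, *tids;
	char *host;
	int i, j;

	if (ctx == NULL)
		return;
	slurm_step_ctx_get(ctx, SLURM_STEP_CTX_STEPID, &stepid);
	slurm_step_ctx_get(ctx, SLURM_STEP_CTX_NHOSTS, &nhosts);
	slurm_step_ctx_get(ctx, SLURM_STEP_CTX_TASKS,  &tasks);
	for (i = 0; i < nhosts; i++) {
		/* hostname and task IDs for the i-th allocated node */
		slurm_step_ctx_get(ctx, SLURM_STEP_CTX_HOST, i, &host);
		slurm_step_ctx_get(ctx, SLURM_STEP_CTX_TID,  i, &tids);
		for (j = 0; j < tasks[i]; j++)
			printf("step %u: task %u runs on %s\n",
			       stepid, tids[j], host);
	}
}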

diff --git a/NEWS b/NEWS
index 3c258bc0f30..c281d6c80a0 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,8 @@ documents those changes that are of interest to users and admins.
 =============================
  -- Remove BNR libary functions and add those for PMI (not fully 
     implemented yet)
+ -- Added hostfile support for POE and srun.  The MP_HOSTFILE environment
+    variable sets the hostfile location; tasks run in the order listed.
 
 * Changes in SLURM 0.7.0-pre3
 =============================
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index c8fef5607b3..ea1d04aca1c 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -200,7 +200,9 @@ enum select_print_mode {
 /* Possible task distributions across the nodes */
 enum task_dist_states {
 	SLURM_DIST_CYCLIC,	/* distribute tasks 1 per node, round robin */
-	SLURM_DIST_BLOCK	/* distribute tasks filling node by node */
+	SLURM_DIST_BLOCK,	/* distribute tasks filling node by node */
+	SLURM_DIST_HOSTFILE,	/* distribute tasks in hostfile order */
+	SLURM_DIST_UNKNOWN	/* distribution not specified */
 };
 
 /* The last entry in node_states must be STATE_END, keep in sync with 
@@ -243,7 +245,12 @@ enum ctx_keys {
 	SLURM_STEP_CTX_STEPID,	/* get the created job step id */
 	SLURM_STEP_CTX_TASKS,	/* get array of task count on each node */
 	SLURM_STEP_CTX_TID,	/* get array of task IDs for specified node */
-	SLURM_STEP_CTX_RESP	/* get job step create response message */
+	SLURM_STEP_CTX_RESP,	/* get job step create response message */
+	SLURM_STEP_CTX_CRED,	/* get job step credential */
+	SLURM_STEP_CTX_SWITCH_JOB, /* get switch plugin job info */
+	SLURM_STEP_CTX_NHOSTS,	/* get number of allocated hosts */
+	SLURM_STEP_CTX_CPUS,	/* get array of CPUs allocated per node */
+	SLURM_STEP_CTX_HOST	/* get hostname for specified node index */
 };
 
 /*****************************************************************************\
diff --git a/src/api/allocate.c b/src/api/allocate.c
index 6224b78cf23..b11b911e29c 100644
--- a/src/api/allocate.c
+++ b/src/api/allocate.c
@@ -42,8 +42,13 @@ extern pid_t getsid(pid_t pid);		/* missing from <unistd.h> */
 
 #include "src/common/read_config.h"
 #include "src/common/slurm_protocol_api.h"
+#include "src/common/hostlist.h"
+#include "src/common/xmalloc.h"
+
+#define BUF_SIZE 1024
 
 static int _handle_rc_msg(slurm_msg_t *msg);
+static int _nodelist_from_hostfile(job_step_create_request_msg_t *req);
 
 /*
  * slurm_allocate_resources - allocate resources for a job request
@@ -61,7 +66,6 @@ slurm_allocate_resources (job_desc_msg_t *req,
 	slurm_msg_t resp_msg;
 	bool host_set = false;
 	char host[64];
-
 	/* 
 	 * set Node and session id for this request
 	 */
@@ -205,6 +209,9 @@ slurm_job_step_create (job_step_create_request_msg_t *req,
 
 	req_msg.msg_type = REQUEST_JOB_STEP_CREATE;
 	req_msg.data     = req; 
+	
+	if (_nodelist_from_hostfile(req) == 0)
+		debug("no nodelist obtained from hostfile");
 
 	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
 		return SLURM_ERROR;
@@ -282,3 +289,106 @@ _handle_rc_msg(slurm_msg_t *msg)
 	else
 		return SLURM_SUCCESS;
 }
+
+static int _nodelist_from_hostfile(job_step_create_request_msg_t *req)
+{
+	char *hostfile = NULL;
+	char *hostname = NULL;
+	FILE *hostfilep = NULL;
+	char in_line[BUF_SIZE];	/* input line */
+	int i, j;
+	int line_size;
+	hostlist_t hostlist = NULL;
+	int count = 0;
+	int len = 0;
+	int ret = 0;
+	int line_num = 0;
+	char *nodelist = NULL;
+	
+	if ((hostfile = getenv("MP_HOSTFILE")) != NULL) {
+		if (strlen(hostfile) < 1)
+			goto no_hostfile;
+		if ((hostfilep = fopen(hostfile, "r")) == NULL) {
+			error("_nodelist_from_hostfile: "
+			      "error opening file %s, %m",
+			      hostfile);
+			goto no_hostfile;
+		}
+		hostlist = hostlist_create(NULL);
+		
+		while (fgets (in_line, BUF_SIZE, hostfilep) != NULL) {
+			line_num++;
+			line_size = strlen(in_line);
+			if (line_size >= (BUF_SIZE - 1)) {
+				error ("Line %d, of hostfile %s too long",
+				       line_num, hostfile);
+				fclose (hostfilep);
+				goto no_hostfile;
+			}
+			for (i = 0; i < line_size; i++) {
+				if (in_line[i] == '\n') {
+					in_line[i] = '\0';
+					break;
+				}
+				if (in_line[i] == '\0')
+					break;
+				if (in_line[i] != '#')
+					continue;
+				if ((i > 0) && (in_line[i - 1] == '\\')) {
+					for (j = i; j < line_size; j++) {
+						in_line[j - 1] = in_line[j];
+					}
+					line_size--;
+					continue;
+				}	
+				in_line[i] = '\0';
+				break;
+			}
+			
+			len += strlen(in_line) + 1;
+			hostlist_push(hostlist, in_line);
+			if (req->num_tasks && (line_num + 1) > req->num_tasks)
+				break;
+		}
+		fclose (hostfilep);
+		
+		nodelist = (char *)xmalloc(sizeof(char)*len);
+		memset(nodelist, 0, len);
+
+		count = hostlist_count(hostlist);
+		if (count <= 0) {
+			error("Hostlist is empty!\n");
+			xfree(*nodelist);
+			goto cleanup_hostfile;
+		}
+		
+		len = 0;
+		while ((hostname = hostlist_shift(hostlist))) {
+			line_num = strlen(hostname) + 1;
+			ret = sprintf(nodelist + len, "%s,", hostname);
+			free(hostname);
+			if (ret < 0 || ret > line_num) {
+				error("bad sprintf, only %d printed", ret);
+				xfree(nodelist);
+				goto cleanup_hostfile;
+			}
+			len += ret;
+		}
+		nodelist[--len] = '\0';
+		debug2("Hostlist from MP_HOSTFILE = %s\n",
+		     nodelist);
+					
+	cleanup_hostfile:
+		hostlist_destroy(hostlist);
+		
+	}
+no_hostfile:
+	if(nodelist) {
+		if(req->node_list)
+			xfree(req->node_list);
+		req->node_list = nodelist;
+		req->num_tasks = count;
+		req->task_dist = SLURM_DIST_HOSTFILE;
+	}
+	return count;
+}
diff --git a/src/api/spawn.c b/src/api/spawn.c
index 9dd4e80e385..0780bfa2930 100644
--- a/src/api/spawn.c
+++ b/src/api/spawn.c
@@ -103,6 +103,7 @@ static void	_free_char_array(char ***argv_p, int cnt);
 static int	_p_launch(slurm_msg_t *req, slurm_step_ctx ctx);
 static int	_sock_bind_wild(int sockfd);
 static int	_task_layout(slurm_step_ctx ctx);
+static int      _task_layout_hostfile(slurm_step_ctx ctx);
 static int	_task_layout_block(slurm_step_ctx ctx);
 static int	_task_layout_cyclic(slurm_step_ctx ctx);
 static void *	_thread_per_node_rpc(void *args);
@@ -123,29 +124,41 @@ slurm_step_ctx_create (job_step_create_request_msg_t *step_req)
 	old_job_alloc_msg_t old_job_req;
 	job_step_create_response_msg_t *step_resp = NULL;
 	resource_allocation_response_msg_t *alloc_resp;
-
+	char *temp = NULL;
 	old_job_req.job_id	= step_req->job_id;
 	old_job_req.uid		= getuid();
 	if (slurm_confirm_allocation(&old_job_req, &alloc_resp) < 0)
 		return NULL;
-
+	
 	if ((slurm_job_step_create(step_req, &step_resp) < 0) ||
 	    (step_resp == NULL)) {
 		slurm_free_resource_allocation_response_msg(alloc_resp);
 		return NULL;	/* slurm errno already set */
 	}
-
+	
+	temp = step_req->node_list;
+	step_req->node_list = step_resp->node_list;
+	step_resp->node_list = temp;
+		
 	rc = xmalloc(sizeof(struct slurm_step_ctx_struct));
 	rc->magic	= STEP_CTX_MAGIC;
 	rc->job_id	= step_req->job_id;
 	rc->user_id	= step_req->user_id;
-	rc->num_tasks	= step_req->num_tasks;
 	rc->task_dist	= step_req->task_dist;
+	rc->num_tasks	= step_req->num_tasks;
 	rc->step_resp	= step_resp;
 	rc->alloc_resp	= alloc_resp;
-
-	rc->hl		= hostlist_create(rc->step_resp->node_list);
-	rc->nhosts	= hostlist_count(rc->hl);
+	rc->hl		= hostlist_create(step_req->node_list);
+
+#ifdef HAVE_FRONT_END	/* Limited job step support */
+	/* All jobs execute through front-end on Blue Gene/L.
+	 * Normally we would not permit execution of job steps,
+	 * but can fake it by just allocating all tasks to
+	 * one of the allocated nodes. */
+	rc->nhosts    = 1;
+#else
+	rc->nhosts = hostlist_count(rc->hl);
+#endif
 	(void) _task_layout(rc);
 
 	return rc;
@@ -162,10 +175,13 @@ slurm_step_ctx_get (slurm_step_ctx ctx, int ctx_key, ...)
 	va_list ap;
 	int rc = SLURM_SUCCESS;
 	uint32_t node_inx;
-	uint32_t *step_id_ptr;
-	uint32_t **array_pptr = (uint32_t **) NULL;
+	uint32_t *uint32_ptr;
+	uint32_t **uint32_array_pptr = (uint32_t **) NULL;
+	char **char_array_pptr = (char **) NULL;
 	job_step_create_response_msg_t ** step_resp_pptr;
-
+	slurm_cred_t  *cred;     /* Slurm job credential    */
+	switch_jobinfo_t *switch_job;
+	
 	if ((ctx == NULL) ||
 	    (ctx->magic != STEP_CTX_MAGIC)) {
 		slurm_seterrno(EINVAL);
@@ -174,35 +190,60 @@ slurm_step_ctx_get (slurm_step_ctx ctx, int ctx_key, ...)
 
 	va_start(ap, ctx_key);
 	switch (ctx_key) {
-		case SLURM_STEP_CTX_STEPID:
-			step_id_ptr = (uint32_t *) va_arg(ap, void *);
-			*step_id_ptr = ctx->step_resp->job_step_id;
-			break;
-		case SLURM_STEP_CTX_TASKS:
-			array_pptr = (uint32_t **) va_arg(ap, void *);
-			*array_pptr = ctx->tasks;
-			break;
-
-		case SLURM_STEP_CTX_TID:
-			node_inx = va_arg(ap, uint32_t);
-			if ((node_inx < 0) || (node_inx > ctx->nhosts)) {
-				slurm_seterrno(EINVAL);
-				rc = SLURM_ERROR;
-				break;
-			}
-			array_pptr = (uint32_t **) va_arg(ap, void *);
-			*array_pptr = ctx->tids[node_inx];
-			break;
-
-		case SLURM_STEP_CTX_RESP:
-			step_resp_pptr = (job_step_create_response_msg_t **) 
-				va_arg(ap, void *);
-			*step_resp_pptr = ctx->step_resp;
+	case SLURM_STEP_CTX_STEPID:
+		uint32_ptr = (uint32_t *) va_arg(ap, void *);
+		*uint32_ptr = ctx->step_resp->job_step_id;
+		break;
+	case SLURM_STEP_CTX_TASKS:
+		uint32_array_pptr = (uint32_t **) va_arg(ap, void *);
+		*uint32_array_pptr = ctx->tasks;
+		break;
+		
+	case SLURM_STEP_CTX_TID:
+		node_inx = va_arg(ap, uint32_t);
+		if ((node_inx < 0) || (node_inx >= ctx->nhosts)) {
+			slurm_seterrno(EINVAL);
+			rc = SLURM_ERROR;
 			break;
-
-		default:
+		}
+		uint32_array_pptr = (uint32_t **) va_arg(ap, void *);
+		*uint32_array_pptr = ctx->tids[node_inx];
+		break;
+		
+	case SLURM_STEP_CTX_RESP:
+		step_resp_pptr = (job_step_create_response_msg_t **) 
+			va_arg(ap, void *);
+		*step_resp_pptr = ctx->step_resp;
+		break;
+	case SLURM_STEP_CTX_CRED:
+		cred = (slurm_cred_t *) va_arg(ap, void *);
+		*cred = ctx->step_resp->cred;
+		break;
+	case SLURM_STEP_CTX_SWITCH_JOB:
+		switch_job = (switch_jobinfo_t *) va_arg(ap, void *);
+		*switch_job = ctx->step_resp->switch_job;
+		break;
+	case SLURM_STEP_CTX_NHOSTS:
+		uint32_ptr = (uint32_t *) va_arg(ap, void *);
+		*uint32_ptr = ctx->nhosts;
+		break;
+	case SLURM_STEP_CTX_CPUS:
+		uint32_array_pptr = (uint32_t **) va_arg(ap, void *);
+		*uint32_array_pptr = ctx->cpus;
+		break;
+	case SLURM_STEP_CTX_HOST:
+		node_inx = va_arg(ap, uint32_t);
+		if ((node_inx < 0) || (node_inx >= ctx->nhosts)) {
 			slurm_seterrno(EINVAL);
 			rc = SLURM_ERROR;
+			break;
+		}
+		char_array_pptr = (char **) va_arg(ap, void *);
+		*char_array_pptr = ctx->host[node_inx];
+		break;
+	default:
+		slurm_seterrno(EINVAL);
+		rc = SLURM_ERROR;
 	}
 	va_end(ap);
 
@@ -246,40 +287,39 @@ slurm_step_ctx_set (slurm_step_ctx ctx, int ctx_key, ...)
 
 	va_start(ap, ctx_key);
 	switch (ctx_key) {
-		case SLURM_STEP_CTX_ARGS:
-			if (ctx->argv)
-				_xfree_char_array(&ctx->argv, ctx->argc);
-			ctx->argc = va_arg(ap, int);
-			if ((ctx->argc < 1) || (ctx->argc > 1024)) {
-				slurm_seterrno(EINVAL);
-				break;
-			}
-			_xcopy_char_array(&ctx->argv, va_arg(ap, char **), 
-					ctx->argc);
-			break;
-
-		case SLURM_STEP_CTX_CHDIR:
-			if (ctx->cwd)
-				xfree(ctx->cwd);
-			ctx->cwd = xstrdup(va_arg(ap, char *));
-			break;
-
-		case SLURM_STEP_CTX_ENV:
-			ctx->env_set = 1;
-			if (ctx->env)
-				_xfree_char_array(&ctx->env, ctx->envc);
-			ctx->envc = va_arg(ap, int);
-			if ((ctx->envc < 1) || (ctx->envc > 1024)) {
-				slurm_seterrno(EINVAL);
-				break;
-			}
-			_xcopy_char_array(&ctx->env, va_arg(ap, char **), 
-					ctx->envc);
+	case SLURM_STEP_CTX_ARGS:
+		if (ctx->argv)
+			_xfree_char_array(&ctx->argv, ctx->argc);
+		ctx->argc = va_arg(ap, int);
+		if ((ctx->argc < 1) || (ctx->argc > 1024)) {
+			slurm_seterrno(EINVAL);
 			break;
-
-		default:
+		}
+		_xcopy_char_array(&ctx->argv, va_arg(ap, char **), 
+				  ctx->argc);
+		break;
+
+	case SLURM_STEP_CTX_CHDIR:
+		if (ctx->cwd)
+			xfree(ctx->cwd);
+		ctx->cwd = xstrdup(va_arg(ap, char *));
+		break;
+
+	case SLURM_STEP_CTX_ENV:
+		ctx->env_set = 1;
+		if (ctx->env)
+			_xfree_char_array(&ctx->env, ctx->envc);
+		ctx->envc = va_arg(ap, int);
+		if ((ctx->envc < 1) || (ctx->envc > 1024)) {
 			slurm_seterrno(EINVAL);
-			rc = SLURM_ERROR;
+			break;
+		}
+		_xcopy_char_array(&ctx->env, va_arg(ap, char **), 
+				  ctx->envc);
+		break;
+	default:
+		slurm_seterrno(EINVAL);
+		rc = SLURM_ERROR;
 	}
 	va_end(ap);
 
@@ -348,9 +388,12 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array)
 	spawn_task_request_msg_t *msg_array_ptr;
 	int *sock_array;
 	slurm_msg_t *req_array_ptr;
-	int i, rc = SLURM_SUCCESS;
+	int i, j, rc = SLURM_SUCCESS;
 	uint16_t slurmd_debug = 0;
 	char *env_var;
+	hostlist_t hostlist = NULL;
+	hostlist_iterator_t itr = NULL;
+	char *host = NULL;
 
 	if ((ctx == NULL) ||
 	    (ctx->magic != STEP_CTX_MAGIC) ||
@@ -390,6 +433,10 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array)
 	msg_array_ptr = xmalloc(sizeof(spawn_task_request_msg_t) *
 			ctx->nhosts);
 	req_array_ptr = xmalloc(sizeof(slurm_msg_t) * ctx->nhosts);
+
+	hostlist = hostlist_create(ctx->alloc_resp->node_list);		
+	itr = hostlist_iterator_create(hostlist);
+
 	for (i=0; i<ctx->nhosts; i++) {
 		spawn_task_request_msg_t *r = &msg_array_ptr[i];
 		slurm_msg_t              *m = &req_array_ptr[i];
@@ -408,7 +455,6 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array)
 		r->nprocs	= ctx->num_tasks;
 		r->switch_job	= ctx->step_resp->switch_job; 
 		r->slurmd_debug	= slurmd_debug;
-
 		/* Task specific message contents */
 		r->global_task_id	= ctx->tids[i][0];
 		r->cpus_allocated	= ctx->cpus[i];
@@ -416,14 +462,28 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array)
 		r->io_port	= ntohs(sock_array[i]);
 		m->msg_type	= REQUEST_SPAWN_TASK;
 		m->data		= r;
-		memcpy(&m->address, &ctx->alloc_resp->node_addr[i], 
+
+		j = 0;
+		while ((host = hostlist_next(itr))) {
+			if (!strcmp(host, ctx->host[i])) {
+				free(host);
+				break;
+			}
+			j++;
+			free(host);
+		}
+		debug2("using %d %s with %d tasks", j, ctx->host[i],
+		       r->nprocs);
+		hostlist_iterator_reset(itr);
+		memcpy(&m->address, &ctx->alloc_resp->node_addr[j], 
 			sizeof(slurm_addr));
 #if		_DEBUG
 		printf("tid=%d, fd=%d, port=%u, node_id=%u\n",
 			ctx->tids[i][0], fd_array[i], r->io_port, i);
 #endif
 	}
-
+	hostlist_iterator_destroy(itr);
+	hostlist_destroy(hostlist);
 	rc = _p_launch(req_array_ptr, ctx);
 
 	xfree(msg_array_ptr);
@@ -433,7 +493,6 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array)
 	return rc;
 }
 
-
 /*
  * slurm_spawn_kill - send the specified signal to an existing job step
  * IN ctx - job step context generated by slurm_step_ctx_create
@@ -535,10 +594,64 @@ static int _task_layout(slurm_step_ctx ctx)
 
 	if (ctx->task_dist == SLURM_DIST_CYCLIC)
 		return _task_layout_cyclic(ctx);
+	else if(ctx->task_dist == SLURM_DIST_HOSTFILE)
+		return _task_layout_hostfile(ctx);
 	else
 		return _task_layout_block(ctx);
 }
 
+/* lay out tasks on the hosts in the order given by the hostfile
+ */
+static int _task_layout_hostfile(slurm_step_ctx ctx)
+{
+	int i = 0, j, taskid = 0;
+	hostlist_iterator_t itr = NULL;
+	hostlist_iterator_t itr_task = NULL;
+	char *host = NULL;
+	char *host_task = NULL;
+	hostlist_t job_alloc_hosts = NULL;
+	hostlist_t step_alloc_hosts = NULL;
+	
+	job_alloc_hosts = hostlist_create(ctx->alloc_resp->node_list);
+	itr = hostlist_iterator_create(job_alloc_hosts);
+	step_alloc_hosts = hostlist_create(ctx->step_resp->node_list);
+	itr_task = hostlist_iterator_create(step_alloc_hosts);
+	while ((host = hostlist_next(itr))) {
+		ctx->tasks[i] = 0;
+		while ((host_task = hostlist_next(itr_task))) {
+			if (!strcmp(host, host_task))
+				ctx->tasks[i]++;
+			free(host_task);
+		}
+		debug2("%s got %d tasks",
+		       host,
+		       ctx->tasks[i]);
+		if(ctx->tasks[i] == 0)
+			goto reset_hosts;
+		ctx->tids[i] = xmalloc(sizeof(uint32_t) * ctx->tasks[i]);
+		hostlist_iterator_reset(itr_task);
+		taskid = 0;
+		j = 0;
+		while ((host_task = hostlist_next(itr_task))) {
+			if (!strcmp(host, host_task)) {
+				ctx->tids[i][j] = taskid;
+				j++;
+			}
+			taskid++;
+			free(host_task);
+		}
+		i++;
+	reset_hosts:
+		hostlist_iterator_reset(itr_task);	
+		free(host);
+	}
+
+	hostlist_iterator_destroy(itr);
+	hostlist_iterator_destroy(itr_task);
+	hostlist_destroy(job_alloc_hosts);
+	hostlist_destroy(step_alloc_hosts);
+	return SLURM_SUCCESS;
+}
 
 /* to effectively deal with heterogeneous nodes, we fake a cyclic
  * distribution to figure out how many tasks go on each node and
diff --git a/src/common/dist_tasks.c b/src/common/dist_tasks.c
index d1706bf46dd..1df7deb265d 100644
--- a/src/common/dist_tasks.c
+++ b/src/common/dist_tasks.c
@@ -47,7 +47,6 @@
 #include "src/common/log.h"
 #include "src/common/xmalloc.h"
 
-
 /* 
  * distribute_tasks - determine how many tasks of a job will be run on each.
  *                    node. Distribution is influenced by number of cpus on
@@ -92,7 +91,6 @@ int *distribute_tasks(const char *mlist, uint16_t num_cpu_groups,
 	i = 0;
 	ncpus = 0;
 	while ((this_node_name = hostlist_shift(master_hl))) {
-
 		if (hostlist_find(task_hl, this_node_name) >= 0) {
 			if (i >= nnodes) {
 				fatal("Internal error: duplicate nodes? "
@@ -110,7 +108,6 @@ int *distribute_tasks(const char *mlist, uint16_t num_cpu_groups,
 	}
 	hostlist_destroy(master_hl);
 	hostlist_destroy(task_hl);
-
 	if (num_tasks >= ncpus) {
 		/*
 		 * Evenly overcommit tasks over the hosts
diff --git a/src/common/dist_tasks.h b/src/common/dist_tasks.h
index b347aebbea7..565a388bfb5 100644
--- a/src/common/dist_tasks.h
+++ b/src/common/dist_tasks.h
@@ -58,10 +58,10 @@
  * NOTE: allocates memory that should be xfreed by caller
  */
 int * distribute_tasks(const char *mlist,
-			uint16_t num_cpu_groups,
-			uint32_t *cpus_per_node,
-			uint32_t *cpu_count_reps,
-			const char *tlist,
-			uint32_t num_tasks);
+		       uint16_t num_cpu_groups,
+		       uint32_t *cpus_per_node,
+		       uint32_t *cpu_count_reps,
+		       const char *tlist,
+		       uint32_t num_tasks);
 
 #endif /* !_DIST_TASKS_H */
diff --git a/src/plugins/switch/federation/federation.c b/src/plugins/switch/federation/federation.c
index c16d3e7dada..48b19b4f6aa 100644
--- a/src/plugins/switch/federation/federation.c
+++ b/src/plugins/switch/federation/federation.c
@@ -1773,12 +1773,11 @@ fed_build_jobinfo(fed_jobinfo_t *jp, hostlist_t hl, int nprocs,
 		int min_procs_per_node;
 		int max_procs_per_node;
 
-		debug("Allocating windows in block mode");
+		debug("Allocating windows in non-cyclic mode");
 		nnodes = hostlist_count(hl);
 		full_node_cnt = nprocs % nnodes;
 		min_procs_per_node = nprocs / nnodes;
 		max_procs_per_node = (nprocs + nnodes - 1) / nnodes;
-	
 		proc_cnt = 0;
 		_lock();
 		for  (i = 0; i < nnodes; i++) {
@@ -1790,7 +1789,7 @@ fed_build_jobinfo(fed_jobinfo_t *jp, hostlist_t hl, int nprocs,
 				task_cnt = max_procs_per_node;
 			else
 				task_cnt = min_procs_per_node;
-			
+						
 			for (j = 0; j < task_cnt; j++) {
 				rc = _allocate_windows(jp->tables_per_task,
 						       jp->tableinfo,
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 237479dc699..ef19cfd929f 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1146,16 +1146,20 @@ extern void build_node_details(struct job_record *job_ptr)
 	}
 
 	job_ptr->num_cpu_groups = 0;
-	job_ptr->node_cnt = bit_set_count(job_ptr->node_bitmap);
+	
+	/* Use hostlist here to insure ordering of info matches that of srun */
+	if ((host_list = hostlist_create(job_ptr->nodes)) == NULL)
+		fatal("hostlist_create error for %s: %m", this_node_name);
+	
+	job_ptr->node_cnt = hostlist_count(host_list);	
+
 	xrealloc(job_ptr->cpus_per_node, 
 		(sizeof(uint32_t) * job_ptr->node_cnt));
 	xrealloc(job_ptr->cpu_count_reps, 
 		(sizeof(uint32_t) * job_ptr->node_cnt));
 	xrealloc(job_ptr->node_addr, 
 		(sizeof(slurm_addr) * job_ptr->node_cnt));
-	/* Use hostlist here to insure ordering of info matches that of srun */
-	if ((host_list = hostlist_create(job_ptr->nodes)) == NULL)
-		fatal("hostlist_create error for %s: %m", job_ptr->nodes);
+	
 
         job_ptr->ntask_cnt = 0;
         xfree(job_ptr->ntask);
@@ -1176,17 +1180,16 @@ extern void build_node_details(struct job_record *job_ptr)
 				job_ptr->ntask[cr_count++] = usable_cpus;
 				if(error_code != SLURM_SUCCESS) {
 					xfree(job_ptr->ntask); 
-					free(this_node_name);
-					error("Invalid node %s in JobId=%u",
-						this_node_name, 
-						job_ptr->job_id);
+					error("Unable to get extra jobinfo "
+					      "from JobId=%u", 
+					      job_ptr->job_id);
 				}
 			} else if (slurmctld_conf.fast_schedule) {
 				usable_cpus = node_ptr->config_ptr->cpus;
 			} else {
 				usable_cpus = node_ptr->cpus;
 			}
-
+			
 			if (usable_cpus <= 0)
 				continue;
 			memcpy(&job_ptr->node_addr[node_inx++],
@@ -1196,11 +1199,12 @@ extern void build_node_details(struct job_record *job_ptr)
 			     usable_cpus)) {
 				cpu_inx++;
 				job_ptr->cpus_per_node[cpu_inx] =
-						usable_cpus;
+					usable_cpus;
+
 				job_ptr->cpu_count_reps[cpu_inx] = 1;
 			} else
 				job_ptr->cpu_count_reps[cpu_inx]++;
-
+			
 		} else {
 			error("Invalid node %s in JobId=%u",
 			      this_node_name, job_ptr->job_id);
@@ -1217,8 +1221,8 @@ extern void build_node_details(struct job_record *job_ptr)
 	if ((cr_enabled) && (error_code == SLURM_SUCCESS)) {
                 error_code = select_g_update_nodeinfo(job_ptr, SELECT_CR_USED_CPUS);
                 if(error_code != SLURM_SUCCESS)
-                      error("Invalid node %s in JobId=%u",
-                            this_node_name, job_ptr->job_id);
+                      error("Unable to update nodeinfo JobId=%u",
+                            job_ptr->job_id);
         }
 }
 
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 135bb940daf..71083e81675 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -1045,7 +1045,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
 	/* return result */
 	if (error_code) {
 		unlock_slurmctld(job_write_lock);
-		info("_slurm_rpc_job_step_create: %s", 
+		error("_slurm_rpc_job_step_create: %s", 
 			slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
@@ -1053,7 +1053,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
 		     step_rec->job_ptr->job_id, step_rec->step_id, TIME_STR);
 
 		job_step_resp.job_step_id = step_rec->step_id;
-		job_step_resp.node_list   = xstrdup(step_rec->step_node_list);
+		job_step_resp.node_list   = xstrdup(req_step_msg->node_list);
 		job_step_resp.cred        = slurm_cred;
 		job_step_resp.switch_job  = switch_copy_jobinfo(
 						step_rec->switch_job);
@@ -1279,7 +1279,7 @@ static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg)
 			slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
-		debug2("_slurm_rpc_old_job_alloc JobId=%u NodeList=%s %s",
+		info("_slurm_rpc_old_job_alloc JobId=%u NodeList=%s %s",
 			job_desc_msg->job_id, job_ptr->nodes, TIME_STR);
 
 		/* send job_ID  and node_name_ptr */
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 0b2c68d40f4..9b1b9646b35 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -405,8 +405,11 @@ _pick_step_nodes (struct job_record  *job_ptr,
 				step_spec->node_list, job_ptr->job_id);
 			goto cleanup;
 		}
-	}
-	else if (step_spec->relative) {
+		if(step_spec->task_dist == SLURM_DIST_HOSTFILE) {
+			FREE_NULL_BITMAP(nodes_avail);
+			return nodes_picked;
+		}
+	} else if (step_spec->relative) {
 		/* Remove first (step_spec->relative) nodes from  
 		 * available list */
 		bitstr_t *relative_nodes = NULL;
@@ -420,14 +423,13 @@ _pick_step_nodes (struct job_record  *job_ptr,
 		bit_not (relative_nodes);
 		bit_and (nodes_avail, relative_nodes);
 		bit_free (relative_nodes);
-	}
-	else {
+	} else {
 		nodes_picked = bit_alloc (bit_size (nodes_avail) );
 		if (nodes_picked == NULL)
 			fatal("bit_alloc malloc failure");
 	}
 
 	/* if user specifies step needs a specific processor count and  */
 	/* all nodes have the same processor count, just translate this to */
 	/* a node count */
 	if (step_spec->cpu_count && (job_ptr->num_cpu_groups == 1)) {
@@ -524,7 +526,8 @@ step_create ( job_step_create_request_msg_t *step_specs,
 		return ESLURM_ALREADY_DONE;
 
 	if ((step_specs->task_dist != SLURM_DIST_CYCLIC) &&
-	    (step_specs->task_dist != SLURM_DIST_BLOCK))
+	    (step_specs->task_dist != SLURM_DIST_BLOCK) &&
+	    (step_specs->task_dist != SLURM_DIST_HOSTFILE))
 		return ESLURM_BAD_DIST;
 
 	if (job_ptr->kill_on_step_done)
@@ -537,7 +540,7 @@ step_create ( job_step_create_request_msg_t *step_specs,
 	if (nodeset == NULL)
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE ;
 	node_count = bit_set_count(nodeset);
-
+	
 	if (step_specs->num_tasks == NO_VAL) {
 		if (step_specs->cpu_count != NO_VAL)
 			step_specs->num_tasks = step_specs->cpu_count;
@@ -553,7 +556,10 @@ step_create ( job_step_create_request_msg_t *step_specs,
 		fatal ("create_step_record failed with no memory");
 
 	/* set the step_record values */
-	step_ptr->step_node_list = bitmap2node_name(nodeset);
+	/* Here is where the node list is set for the job step */
+	step_ptr->step_node_list = xstrdup(step_specs->node_list);
+	xfree(step_specs->node_list);
+	step_specs->node_list = bitmap2node_name(nodeset);
 	step_ptr->step_node_bitmap = nodeset;
 	step_ptr->cyclic_alloc = 
 		(uint16_t) (step_specs->task_dist == SLURM_DIST_CYCLIC);
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index 96824b85f88..29802445b65 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -63,8 +63,8 @@ static void  _wait_for_resources(resource_allocation_response_msg_t **resp);
 static bool  _retry();
 static void  _intr_handler(int signo);
 
-static job_step_create_request_msg_t * _step_req_create(srun_job_t *j);
-static void _step_req_destroy(job_step_create_request_msg_t *r);
+/* static job_step_create_request_msg_t * _step_req_create(srun_job_t *j); */
+/* static void _step_req_destroy(job_step_create_request_msg_t *r); */
 
 static sig_atomic_t destroy_job = 0;
 static srun_job_t *allocate_job = NULL;
@@ -462,77 +462,79 @@ job_desc_msg_destroy(job_desc_msg_t *j)
 	}
 }
 
-static job_step_create_request_msg_t *
-_step_req_create(srun_job_t *j)
-{
-	job_step_create_request_msg_t *r = xmalloc(sizeof(*r));
-	r->job_id     = j->jobid;
-	r->user_id    = opt.uid;
-	r->node_count = j->nhosts; 
-	r->cpu_count  = opt.overcommit ? j->nhosts 
-		                       : (opt.nprocs*opt.cpus_per_task);
-	r->num_tasks  = opt.nprocs;
-	r->node_list  = j->nodelist;
-	r->network    = opt.network;
-	r->name       = opt.job_name;
-	r->relative   = false;      /* XXX fix this oneday */
-
-	switch (opt.distribution) {
-	case SRUN_DIST_UNKNOWN:
-		r->task_dist = (opt.nprocs <= j->nhosts) ? SLURM_DIST_CYCLIC
-			                                 : SLURM_DIST_BLOCK;
-		break;
-	case SRUN_DIST_CYCLIC:
-		r->task_dist = SLURM_DIST_CYCLIC;
-		break;
-	default: /* (opt.distribution == SRUN_DIST_BLOCK) */
-		r->task_dist = SLURM_DIST_BLOCK;
-		break;
-	}
-
-	if (slurmctld_comm_addr.port) {
-		r->host = xstrdup(slurmctld_comm_addr.hostname);
-		r->port = slurmctld_comm_addr.port;
-	}
-
-	return(r);
-}
-
-static void
-_step_req_destroy(job_step_create_request_msg_t *r)
-{
-	if (r) {
-		xfree(r->host);
-		xfree(r);
-	}
-}
-
-int
-create_job_step(srun_job_t *job)
-{
-	job_step_create_request_msg_t  *req  = NULL;
-	job_step_create_response_msg_t *resp = NULL;
-
-	if (!(req = _step_req_create(job))) { 
-		error ("Unable to allocate step request message");
-		return -1;
-	}
-	if ((slurm_job_step_create(req, &resp) < 0) || (resp == NULL)) { 
-		error ("Unable to create job step: %m");
-		return -1;
-	}
-
-	job->stepid  = resp->job_step_id;
-	job->cred    = resp->cred;
-	job->switch_job = resp->switch_job;
-	/* 
-	 * Recreate filenames which may depend upon step id
-	 */
-	job_update_io_fnames(job);
-
-	_step_req_destroy(req);
-	return 0;
-}
+/* static job_step_create_request_msg_t * */
+/* _step_req_create(srun_job_t *j) */
+/* { */
+/* 	job_step_create_request_msg_t *r = xmalloc(sizeof(*r)); */
+/* 	r->job_id     = j->jobid; */
+/* 	r->user_id    = opt.uid; */
+/* 	r->node_count = j->nhosts;  */
+/* 	r->cpu_count  = opt.overcommit ? j->nhosts  */
+/* 		                       : (opt.nprocs*opt.cpus_per_task); */
+/* 	r->num_tasks  = opt.nprocs; */
+/* 	r->node_list  = xstrdup(j->nodelist); */
+/* 	r->network    = opt.network; */
+/* 	r->name       = opt.job_name; */
+/* 	r->relative   = false;      /\* XXX fix this oneday *\/ */
+
+/* 	switch (opt.distribution) { */
+/* 	case SRUN_DIST_UNKNOWN: */
+/* 		r->task_dist = (opt.nprocs <= j->nhosts) ? SLURM_DIST_CYCLIC */
+/* 			                                 : SLURM_DIST_BLOCK; */
+/* 		break; */
+/* 	case SRUN_DIST_CYCLIC: */
+/* 		r->task_dist = SLURM_DIST_CYCLIC; */
+/* 		break; */
+/* 	default: /\* (opt.distribution == SRUN_DIST_BLOCK) *\/ */
+/* 		r->task_dist = SLURM_DIST_BLOCK; */
+/* 		break; */
+/* 	} */
+
+/* 	if (slurmctld_comm_addr.port) { */
+/* 		r->host = xstrdup(slurmctld_comm_addr.hostname); */
+/* 		r->port = slurmctld_comm_addr.port; */
+/* 	} */
+
+/* 	return(r); */
+/* } */
+
+/* static void */
+/* _step_req_destroy(job_step_create_request_msg_t *r) */
+/* { */
+/* 	if (r) { */
+/* 		xfree(r->host); */
+/* 		xfree(r->node_list); */
+/* 		xfree(r); */
+/* 	} */
+/* } */
+
+/* int */
+/* create_job_step(srun_job_t *job) */
+/* { */
+/* 	job_step_create_request_msg_t  *req  = NULL; */
+/* 	job_step_create_response_msg_t *resp = NULL; */
+/* 	char *temp = NULL; */
+
+/* 	if (!(req = _step_req_create(job))) {  */
+/* 		error ("Unable to allocate step request message"); */
+/* 		return -1; */
+/* 	} */
+/* 	if ((slurm_job_step_create(req, &resp) < 0) || (resp == NULL)) {  */
+/* 		error ("Unable to create job step: %m"); */
+/* 		return -1; */
+/* 	} */
+	
+/* 	job->stepid  = resp->job_step_id; */
+/* 	job->cred    = resp->cred; */
+/* 	job->switch_job = resp->switch_job; */
+/* 	/\*  */
+/* 	 * Recreate filenames which may depend upon step id */
+/* 	 *\/ */
+/* 	job_update_io_fnames(job); */
+
+/* 	_step_req_destroy(req); */
+/* 	return 0; */
+/* } */
 
 void 
 set_allocate_job(srun_job_t *job) 
diff --git a/src/srun/allocate.h b/src/srun/allocate.h
index 7411f7006a0..fe7e8d9ff3e 100644
--- a/src/srun/allocate.h
+++ b/src/srun/allocate.h
@@ -83,7 +83,7 @@ uint32_t jobid_from_env(void);
  *
  * Returns -1 if job step creation failure, 0 otherwise
  */
-int create_job_step(srun_job_t *j);
+/* int create_job_step(srun_job_t *j); */
 
 /* set the job for debugging purpose */
 void set_allocate_job(srun_job_t *job);
diff --git a/src/srun/launch.c b/src/srun/launch.c
index edfe7b6ef13..15da6932c0b 100644
--- a/src/srun/launch.c
+++ b/src/srun/launch.c
@@ -37,6 +37,7 @@
 
 #include "src/common/log.h"
 #include "src/common/macros.h"
+#include "src/common/hostlist.h"
 #include "src/common/slurm_protocol_api.h"
 #include "src/common/xmalloc.h"
 #include "src/common/xsignal.h"
@@ -105,13 +106,13 @@ launch(void *arg)
 	slurm_msg_t *req_array_ptr;
 	launch_tasks_request_msg_t *msg_array_ptr;
 	srun_job_t *job = (srun_job_t *) arg;
-	int i, my_envc;
-	char hostname[MAXHOSTNAMELEN];
+	int i, j, my_envc;
+	hostlist_t hostlist = NULL;
+	hostlist_iterator_t itr = NULL;
+	char *host = NULL;
 
 	update_job_state(job, SRUN_JOB_LAUNCHING);
-	if (gethostname(hostname, MAXHOSTNAMELEN) < 0)
-		error("gethostname: %m");
-
+	
 	debug("going to launch %d tasks on %d hosts", opt.nprocs, job->nhosts);
 	debug("sending to slurmd port %d", slurm_get_slurmd_port());
 
@@ -119,6 +120,10 @@ launch(void *arg)
 		xmalloc(sizeof(launch_tasks_request_msg_t)*job->nhosts);
 	req_array_ptr = xmalloc(sizeof(slurm_msg_t) * job->nhosts);
 	my_envc = envcount(environ);
+
+	hostlist = hostlist_create(job->nodelist);		
+	itr = hostlist_iterator_create(hostlist);
+
 	for (i = 0; i < job->nhosts; i++) {
 		launch_tasks_request_msg_t *r = &msg_array_ptr[i];
 		slurm_msg_t                *m = &req_array_ptr[i];
@@ -159,11 +164,27 @@ launch(void *arg)
 		r->srun_node_id    = (uint32_t)i;
 		r->io_port         = ntohs(job->listenport[i%job->num_listen]);
 		r->resp_port       = ntohs(job->jaddr[i%job->njfds].sin_port);
+		
 		m->msg_type        = REQUEST_LAUNCH_TASKS;
-		m->data            = &msg_array_ptr[i];
-		memcpy(&m->address, &job->slurmd_addr[i], sizeof(slurm_addr));
+		m->data            = r;
+		j = 0;
+		while ((host = hostlist_next(itr))) {
+			if (!strcmp(host, job->host[i])) {
+				free(host);
+				break;
+			}
+			j++;
+			free(host);
+		}
+		hostlist_iterator_reset(itr);
+		debug2("using %d %s with %d tasks", j, job->host[i],
+		       r->nprocs);
+		
+		memcpy(&m->address, &job->slurmd_addr[j], sizeof(slurm_addr));
 	}
-
+	hostlist_iterator_destroy(itr);
+	hostlist_destroy(hostlist);
+	
 	_p_launch(req_array_ptr, job);
 
 	xfree(msg_array_ptr);
@@ -313,7 +334,6 @@ static void _p_launch(slurm_msg_t *req, srun_job_t *job)
 
 	thd = xmalloc (job->nhosts * sizeof (thd_t));
 	for (i = 0; i < job->nhosts; i++) {
-
 		if (job->ntask[i] == 0)	{	/* No tasks for this node */
 			debug("Node %s is unused",job->host[i]);
 			job->host_state[i] = SRUN_HOST_REPLIED;
@@ -451,7 +471,8 @@ static void * _p_launch_task(void *arg)
 	if (_send_msg_rc(req) < 0) {	/* Has timeout */
 
 		if (errno != EINTR)
-			verbose("launch error on %s: %m", job->host[nodeid]);
+			verbose("fisrt launch error on %s: %m", 
+				job->host[nodeid]);
 
 		if ((errno != ETIMEDOUT) 
 		    && (job->state == SRUN_JOB_LAUNCHING)
@@ -464,7 +485,8 @@ static void * _p_launch_task(void *arg)
 		if (errno == EINTR)
 			verbose("launch on %s canceled", job->host[nodeid]);
 		else
-			error("launch error on %s: %m", job->host[nodeid]);
+			error("second launch error on %s: %m", 
+			      job->host[nodeid]);
 
 		_update_failed_node(job, nodeid);
 
diff --git a/src/srun/msg.c b/src/srun/msg.c
index 07950ec28e3..61ef3d85f4c 100644
--- a/src/srun/msg.c
+++ b/src/srun/msg.c
@@ -265,7 +265,6 @@ _process_launch_resp(srun_job_t *job, launch_tasks_response_msg_t *msg)
 		error ("Bad launch response from %s", msg->node_name);
 		return;
 	}
-
 	pthread_mutex_lock(&job->task_mutex);
 	job->host_state[msg->srun_node_id] = SRUN_HOST_REPLIED;
 	pthread_mutex_unlock(&job->task_mutex);
@@ -368,8 +367,9 @@ _launch_handler(srun_job_t *job, slurm_msg_t *resp)
 	launch_tasks_response_msg_t *msg = resp->data;
 	pipe_enum_t pipe_enum = PIPE_HOST_STATE;
 	
-	debug2("received launch resp from %s nodeid=%d", msg->node_name,
-			msg->srun_node_id);
+	debug3("received launch resp from %s nodeid=%d", 
+	       msg->node_name,
+	       msg->srun_node_id);
 	
 	if (msg->return_code != 0)  {
 
@@ -412,8 +412,10 @@ static void
 _confirm_launch_complete(srun_job_t *job)
 {
 	int i;
-
+	printf("job->nhosts %d\n",job->nhosts);
+		
 	for (i=0; i<job->nhosts; i++) {
+		printf("job->nhosts %d\n",job->nhosts);
 		if (job->host_state[i] != SRUN_HOST_REPLIED) {
 			error ("Node %s not responding, terminating job step",
 			       job->host[i]);
@@ -656,7 +658,7 @@ _handle_msg(srun_job_t *job, slurm_msg_t *msg)
 	uid_t uid     = getuid();
 	srun_timeout_msg_t *to;
 	srun_node_fail_msg_t *nf;
-
+	
 	if ((req_uid != slurm_uid) && (req_uid != 0) && (req_uid != uid)) {
 		error ("Security violation, slurm message from uid %u", 
 		       (unsigned int) req_uid);
@@ -856,21 +858,21 @@ _msg_thr_poll(srun_job_t *job)
 {
 	struct pollfd *fds;
 	int i;
-
+	
 	fds = xmalloc((job->njfds + 1) * sizeof(*fds));
 
 	_set_jfds_nonblocking(job);
-
+		
 	for (i = 0; i < job->njfds; i++)
 		_poll_set_rd(fds[i], job->jfd[i]);
 	_poll_set_rd(fds[i], slurmctld_fd);
-
+	
 	while (!_job_msg_done(job)) {
 		if (_do_poll(job, fds, _get_next_timeout(job)) == 0) {
 			_do_poll_timeout(job);
 			continue;
 		}
-
+		
 		for (i = 0; i < (job->njfds + 1) ; i++) {
 			unsigned short revents = fds[i].revents;
 			if ((revents & POLLERR) || 
@@ -882,6 +884,7 @@ _msg_thr_poll(srun_job_t *job)
 		}
 		
 	}
+	
 	xfree(fds);	/* if we were to break out of while loop */
 }
 
@@ -1029,7 +1032,8 @@ msg_thr_create(srun_job_t *job)
 	for (i = 0; i < job->njfds; i++) {
 		if ((job->jfd[i] = slurm_init_msg_engine_port(0)) < 0)
 			fatal("init_msg_engine_port: %m");
-		if (slurm_get_stream_addr(job->jfd[i], &job->jaddr[i]) 
+		if (slurm_get_stream_addr(job->jfd[i], 
+					  &job->jaddr[i]) 
 		    < 0)
 			fatal("slurm_get_stream_addr: %m");
 		debug("initialized job control port %d\n",
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 58b31c91d8a..9c3b38112f4 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -156,7 +156,7 @@ static bool  _under_parallel_debugger(void);
 
 static void  _usage(void);
 static bool  _valid_node_list(char **node_list_pptr);
-static enum  distribution_t _verify_dist_type(const char *arg);
+static enum  task_dist_states _verify_dist_type(const char *arg);
 static bool  _verify_node_count(const char *arg, int *min, int *max);
 static int   _verify_geometry(const char *arg, int *geometry);
 static int   _verify_conn_type(const char *arg);
@@ -228,17 +228,19 @@ static bool _valid_node_list(char **node_list_pptr)
 
 /* 
  * verify that a distribution type in arg is of a known form
- * returns the distribution_t or SRUN_DIST_UNKNOWN
+ * returns the task_dist_states or SLURM_DIST_UNKNOWN
  */
-static enum distribution_t _verify_dist_type(const char *arg)
+static enum task_dist_states _verify_dist_type(const char *arg)
 {
 	int len = strlen(arg);
-	enum distribution_t result = SRUN_DIST_UNKNOWN;
+	enum task_dist_states result = SLURM_DIST_UNKNOWN;
 
 	if (strncasecmp(arg, "cyclic", len) == 0)
-		result = SRUN_DIST_CYCLIC;
+		result = SLURM_DIST_CYCLIC;
 	else if (strncasecmp(arg, "block", len) == 0)
-		result = SRUN_DIST_BLOCK;
+		result = SLURM_DIST_BLOCK;
+	else if (strncasecmp(arg, "hostfile", len) == 0)
+		result = SLURM_DIST_HOSTFILE;
 
 	return result;
 }
@@ -461,7 +463,7 @@ static void _opt_default()
 	opt.dependency = NO_VAL;
 	opt.account  = NULL;
 
-	opt.distribution = SRUN_DIST_UNKNOWN;
+	opt.distribution = SLURM_DIST_UNKNOWN;
 
 	opt.ofname = NULL;
 	opt.ifname = NULL;
@@ -607,7 +609,7 @@ static void
 _process_env_var(env_vars_t *e, const char *val)
 {
 	char *end = NULL;
-	enum distribution_t dt;
+	enum task_dist_states dt;
 
 	debug2("now processing env var %s=%s", e->var, val);
 
@@ -637,7 +639,7 @@ _process_env_var(env_vars_t *e, const char *val)
 
 	case OPT_DISTRIB:
 	    dt = _verify_dist_type(val);
-	    if (dt == SRUN_DIST_UNKNOWN) {
+	    if (dt == SLURM_DIST_UNKNOWN) {
 		    error("\"%s=%s\" -- invalid distribution type. " 
 		          "ignoring...", e->var, val);
 	    } else 
@@ -933,7 +935,7 @@ void set_options(const int argc, char **argv, int first)
 				break;
 						
 			opt.distribution = _verify_dist_type(optarg);
-			if (opt.distribution == SRUN_DIST_UNKNOWN) {
+			if (opt.distribution == SLURM_DIST_UNKNOWN) {
 				error("distribution type `%s' " 
 				      "is not recognized", optarg);
 				exit(1);
@@ -1605,7 +1607,7 @@ static void _opt_list()
 	info("partition      : %s",
 	     opt.partition == NULL ? "default" : opt.partition);
 	info("job name       : `%s'", opt.job_name);
-	info("distribution   : %s", format_distribution_t(opt.distribution));
+	info("distribution   : %s", format_task_dist_states(opt.distribution));
 	info("core format    : %s", core_format_name (opt.core_type));
 	info("verbose        : %d", _verbose);
 	info("slurmd_debug   : %d", opt.slurmd_debug);
diff --git a/src/srun/opt.h b/src/srun/opt.h
index 6d748cd4986..2a641a0cc9b 100644
--- a/src/srun/opt.h
+++ b/src/srun/opt.h
@@ -62,8 +62,9 @@ enum modes {
 
 enum modes mode;
 
-#define format_distribution_t(t) (t == SRUN_DIST_BLOCK) ? "block" :   \
-		                 (t == SRUN_DIST_CYCLIC) ? "cyclic" : \
+#define format_task_dist_states(t) (t == SLURM_DIST_BLOCK) ? "block" :   \
+		                 (t == SLURM_DIST_CYCLIC) ? "cyclic" : \
+			         (t == SLURM_DIST_HOSTFILE) ? "hostfile" : \
 			         "unknown"
 
 enum io_t {
@@ -97,7 +98,7 @@ typedef struct srun_options {
 	bool nodes_set;		/* true if nodes explicitly set */
 	int  time_limit;	/* --time,   -t			*/
 	char *partition;	/* --partition=n,   -p n   	*/
-	enum distribution_t
+	enum task_dist_states
 		distribution;	/* --distribution=, -m dist	*/
 	char *job_name;		/* --job-name=,     -J name	*/
 	unsigned int jobid;     /* --jobid=jobid                */
diff --git a/src/srun/reattach.c b/src/srun/reattach.c
index 0234c2f0967..729faa25183 100644
--- a/src/srun/reattach.c
+++ b/src/srun/reattach.c
@@ -306,8 +306,12 @@ _attach_to_job(srun_job_t *job)
 		r->job_id          = job->jobid;
 		r->job_step_id     = job->stepid;
 		r->srun_node_id    = (uint32_t) i;
-		r->io_port         = ntohs(job->listenport[i%job->num_listen]);
-		r->resp_port       = ntohs(job->jaddr[i%job->njfds].sin_port);
+		r->io_port         = 
+			ntohs(job->
+			      listenport[i%job->num_listen]);
+		r->resp_port       = 
+			ntohs(job->
+			      jaddr[i%job->njfds].sin_port);
 		r->cred            = job->cred;
 
 
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 2586c94205e..7228ac0e133 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -115,7 +115,7 @@ static int   _run_srun_script (srun_job_t *job, char *script);
 int srun(int ac, char **av)
 {
 	allocation_resp *resp;
-	srun_job_t *job;
+	srun_job_t *job = NULL;
 	char *task_cnt, *bgl_part_id = NULL;
 	int exitcode = 0;
 	env_t *env = xmalloc(sizeof(env_t));
@@ -190,7 +190,9 @@ int srun(int ac, char **av)
 			info ("Warning: unable to assume uid=%lu\n", opt.uid);
 		if (_verbose)
 			_print_job_information(resp);
-		job = job_create_allocation(resp); 
+		
+		job = job_create_allocation(resp);
+		
 		if (msg_thr_create(job) < 0)
 			job_fatal(job, "Unable to create msg thread");
 		exitcode = _run_job_script(job, env);
@@ -209,11 +211,13 @@ int srun(int ac, char **av)
 		}
 		if (job_resp_hack_for_step(resp))	/* FIXME */
 			exit(1);
+		
 		job = job_create_allocation(resp);
+		
 		job->old_job = true;
 		sig_setup_sigmask();
-		if (create_job_step(job) < 0)
-			exit(1);
+		if (build_step_ctx(job) < 0)
+			exit(1);
 		slurm_free_resource_allocation_response_msg(resp);
 		
 	} else if (mode == MODE_ATTACH) {
@@ -226,13 +230,11 @@ int srun(int ac, char **av)
 			exit(1);
 		if (_verbose)
 			_print_job_information(resp);
-
-		job = job_create_allocation(resp); 
-		if (create_job_step(job) < 0) {
-			srun_job_destroy(job, 0);
-			exit(1);
-		}
-		slurm_free_resource_allocation_response_msg(resp);
+						
+		job = job_create_allocation(resp);
+		if (build_step_ctx(job) < 0)
+			exit(1);
+ 		slurm_free_resource_allocation_response_msg(resp);
 	}
 
 	/*
@@ -277,10 +279,10 @@ int srun(int ac, char **av)
 
 	if (sig_thr_create(job) < 0)
 		job_fatal(job, "Unable to create signals thread: %m");
-
+	
 	if (launch_thr_create(job) < 0)
-		job_fatal(job, "Unable to create launch thread: %m");
-
+ 		job_fatal(job, "Unable to create launch thread: %m");
+	
 	/* wait for job to terminate 
 	 */
 	slurm_mutex_lock(&job->state_mutex);
@@ -340,7 +342,8 @@ _task_count_string (srun_job_t *job)
 	int i, last_val, last_cnt;
 	char tmp[16];
 	char *str = xstrdup ("");
-
+	if(job->ntasks == 0)
+		return (str);
 	last_val = job->ntask[0];
 	last_cnt = 1;
 	for (i=1; i<job->nhosts; i++) {
@@ -367,7 +370,7 @@ _task_count_string (srun_job_t *job)
 static void
 _switch_standalone(srun_job_t *job)
 {
-	int cyclic = (opt.distribution == SRUN_DIST_CYCLIC);
+	int cyclic = (opt.distribution == SLURM_DIST_CYCLIC);
 
 	if (switch_alloc_jobinfo(&job->switch_job) < 0)
 		fatal("switch_alloc_jobinfo: %m");
@@ -816,7 +819,7 @@ static int _run_job_script (srun_job_t *job, env_t *env)
 		env->jobid = job->jobid;
 		env->nhosts = job->nhosts;
 		env->nodelist = job->nodelist;
-		env->task_count = _task_count_string (job);
+		//env->task_count = _task_count_string (job);
 	}
 	
 	if (setup_env(env) != SLURM_SUCCESS) 
diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c
index f89c4d45918..03f77cd142b 100644
--- a/src/srun/srun_job.c
+++ b/src/srun/srun_job.c
@@ -52,6 +52,7 @@
 #include "src/srun/fname.h"
 #include "src/srun/attach.h"
 #include "src/srun/io.h"
+#include "src/srun/msg.h"
 
 
 /*
@@ -80,8 +81,11 @@ static void       _dist_cyclic(srun_job_t *job);
 static inline int _estimate_nports(int nclients, int cli_per_port);
 static int        _compute_task_count(allocation_info_t *info);
 static void       _set_nprocs(allocation_info_t *info);
-static srun_job_t *    _job_create_internal(allocation_info_t *info);
+static srun_job_t *_job_create_internal(allocation_info_t *info);
+static srun_job_t *_job_create_structure(allocation_info_t *info);
 static void       _job_fake_cred(srun_job_t *job);
+static void       _job_noalloc_step_create(srun_job_t *job,
+					   allocation_info_t *info);
 static int        _job_resp_add_nodes(bitstr_t *req_bitmap, 
 				bitstr_t *exc_bitmap, int node_cnt);
 static int        _job_resp_bitmap(hostlist_t resp_node_hl, char *nodelist, 
@@ -135,34 +139,6 @@ _dist_cyclic(srun_job_t *job)
 	}
 }
 
-/*
- * Create an srun job structure from a resource allocation response msg
- */
-srun_job_t *
-job_create_allocation(resource_allocation_response_msg_t *resp)
-{
-	srun_job_t *job;
-	allocation_info_t *i = xmalloc(sizeof(*i));
-
-	i->nodelist       = _normalize_hostlist(resp->node_list);
-	i->nnodes	  = resp->node_cnt;
-	i->jobid          = resp->job_id;
-	i->stepid         = NO_VAL;
-	i->num_cpu_groups = resp->num_cpu_groups;
-	i->cpus_per_node  = resp->cpus_per_node;
-	i->cpu_count_reps = resp->cpu_count_reps;
-	i->addrs          = resp->node_addr;
-	i->select_jobinfo = select_g_copy_jobinfo(resp->select_jobinfo);
-
-	job = _job_create_internal(i);
-
-	xfree(i->nodelist);
-	xfree(i);
-
-	return (job);
-}
-
-
 /* 
  * Create an srun job structure w/out an allocation response msg.
  * (i.e. use the command line options)
@@ -203,7 +179,8 @@ job_create_noalloc(void)
 	/* 
 	 * Create job, then fill in host addresses
 	 */
-	job = _job_create_internal(ai);
+	job = _job_create_structure(ai);
+	_job_noalloc_step_create(job, ai);
 
 	for (i = 0; i < job->nhosts; i++) {
 		char *nd = get_conf_node_hostname(job->host[i]);
@@ -213,13 +190,243 @@ job_create_noalloc(void)
 	}
 
 	_job_fake_cred(job);
-
+	
    error:
 	xfree(ai);
 	return (job);
 
 }
 
+/*
+ * Create an srun job structure from a resource allocation response msg
+ */
+extern srun_job_t *
+job_create_allocation(resource_allocation_response_msg_t *resp)
+{
+	srun_job_t *job;
+	allocation_info_t *i = xmalloc(sizeof(*i));
+
+	i->nodelist       = _normalize_hostlist(resp->node_list);
+	i->nnodes	  = resp->node_cnt;
+	i->jobid          = resp->job_id;
+	i->stepid         = NO_VAL;
+	i->num_cpu_groups = resp->num_cpu_groups;
+	i->cpus_per_node  = resp->cpus_per_node;
+	i->cpu_count_reps = resp->cpu_count_reps;
+	i->addrs          = resp->node_addr;
+	i->select_jobinfo = select_g_copy_jobinfo(resp->select_jobinfo);
+
+	job = _job_create_structure(i);
+
+	xfree(i->nodelist);
+	xfree(i);
+
+	return (job);
+}
+
+/*
+ * Create and initialize an srun job structure from an allocation_info_t
+ */
+static srun_job_t *
+_job_create_structure(allocation_info_t *info)
+{
+	srun_job_t *job = xmalloc(sizeof(srun_job_t));
+	int i, cpu_inx, cpu_cnt;
+	
+	debug2("creating job with %d tasks", opt.nprocs);
+
+	slurm_mutex_init(&job->state_mutex);
+	pthread_cond_init(&job->state_cond, NULL);
+	job->state = SRUN_JOB_INIT;
+
+ 	job->nodelist = xstrdup(info->nodelist); 
+	job->stepid  = info->stepid;
+	
+#ifdef HAVE_FRONT_END	/* Limited job step support */
+	/* All jobs execute through front-end on Blue Gene/L.
+	 * Normally we would not permit execution of job steps,
+	 * but can fake it by just allocating all tasks to
+	 * one of the allocated nodes. */
+	job->nhosts    = 1;
+	opt.overcommit = true;
+#else
+	job->nhosts   = info->nnodes;
+#endif
+
+	job->select_jobinfo = info->select_jobinfo;
+	job->jobid   = info->jobid;
+	
+	job->task_prolog = xstrdup(opt.task_prolog);
+	job->task_epilog = xstrdup(opt.task_epilog);
+	/* Compute number of file descriptors / Ports needed for Job 
+	 * control info server
+	 */
+	job->njfds = _estimate_nports(opt.nprocs, 48);
+	debug3("njfds = %d", job->njfds);
+	job->jfd = (slurm_fd *)
+		xmalloc(job->njfds * sizeof(slurm_fd));
+	job->jaddr = (slurm_addr *) 
+		xmalloc(job->njfds * sizeof(slurm_addr));
+	/* Compute number of listening sockets needed to allow
+	 * all of the slurmds to establish IO streams with srun, without
+	 * overstressing the TCP/IP backoff/retry algorithm
+	 */
+	job->num_listen = _estimate_nports(opt.nprocs, 64);
+	job->listensock = (int *) 
+		xmalloc(job->num_listen * sizeof(int));
+	job->listenport = (int *) 
+		xmalloc(job->num_listen * sizeof(int));
+	
+	job->hostid = xmalloc(opt.nprocs * sizeof(uint32_t));
+	
+ 	slurm_mutex_init(&job->task_mutex);
+	
+	job->old_job = false;
+	job->removed = false;
+	job->signaled = false;
+	job->rc       = -1;
+	
+	/* 
+	 *  Initialize Launch and Exit timeout values
+	 */
+	job->ltimeout = 0;
+	job->etimeout = 0;
+	
+	
+	job->eio = eio_handle_create();
+	job->ioservers_ready = 0;
+	/* "nhosts" number of IO protocol sockets */
+	job->ioserver = (eio_obj_t **)xmalloc(job->nhosts*sizeof(eio_obj_t *));
+	
+	job->slurmd_addr = xmalloc(job->nhosts * sizeof(slurm_addr));
+	if (info->addrs)
+		memcpy( job->slurmd_addr, info->addrs,
+			sizeof(slurm_addr)*job->nhosts);
+
+	job->free_incoming = list_create(NULL); /* FIXME! Needs destructor */
+	for (i = 0; i < STDIO_MAX_FREE_BUF; i++) {
+		list_enqueue(job->free_incoming, alloc_io_buf());
+	}
+	job->free_outgoing = list_create(NULL); /* FIXME! Needs destructor */
+	for (i = 0; i < STDIO_MAX_FREE_BUF; i++) {
+		list_enqueue(job->free_outgoing, alloc_io_buf());
+	}
+	
+	/* ntask task states and statii*/
+	job->task_state  =  xmalloc(opt.nprocs * sizeof(srun_task_state_t));
+	job->tstatus	 =  xmalloc(opt.nprocs * sizeof(int));
+	job->free_incoming = list_create(NULL); /* FIXME! Needs destructor */
+	job->incoming_count = 0;
+	for (i = 0; i < STDIO_MAX_FREE_BUF; i++) {
+		list_enqueue(job->free_incoming, alloc_io_buf());
+	}
+	job->free_outgoing = list_create(NULL); /* FIXME! Needs destructor */
+	job->outgoing_count = 0;
+	for (i = 0; i < STDIO_MAX_FREE_BUF; i++) {
+		list_enqueue(job->free_outgoing, alloc_io_buf());
+	}
+	
+	job_update_io_fnames(job);
+	
+	return (job);
+	
+	
+}
+
+extern int build_step_ctx(srun_job_t *job)
+{
+	job_step_create_request_msg_t  *r  = NULL;
+	int i;
+	char *temp = NULL;
+
+	r = xmalloc(sizeof(job_step_create_request_msg_t));
+	if (r == NULL) {
+		error("calloc error");
+		return -1;
+	}
+	r->job_id     = job->jobid;
+	r->user_id    = opt.uid;
+	r->node_count = job->nhosts;
+	/* Processor count not relevant to poe */
+	r->cpu_count  = job->nhosts;
+	r->num_tasks  = opt.nprocs;
+	r->node_list  = xstrdup(job->nodelist);
+	switch (opt.distribution) {
+	case SLURM_DIST_UNKNOWN:
+		r->task_dist = (opt.nprocs <= job->nhosts) 
+			? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
+		break;
+	case SLURM_DIST_CYCLIC:
+		r->task_dist = SLURM_DIST_CYCLIC;
+		break;
+	case SLURM_DIST_HOSTFILE:
+		r->task_dist = SLURM_DIST_HOSTFILE;
+		break;
+	default: /* (opt.distribution == SLURM_DIST_BLOCK) */
+		r->task_dist = SLURM_DIST_BLOCK;
+		break;
+	}
+	
+	r->network = xstrdup(opt.network);
+	if (slurmctld_comm_addr.port) {
+		r->host = xstrdup(slurmctld_comm_addr.hostname);
+		r->port = slurmctld_comm_addr.port;
+	}
+	job->step_ctx = slurm_step_ctx_create(r);
+	if (job->step_ctx == NULL) {
+		error("slurm_step_ctx_create: %s", 
+		      slurm_strerror(slurm_get_errno()));
+		return -1;
+	}
+	
+	if (slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NHOSTS, 
+			       &job->nhosts) != SLURM_SUCCESS) {
+		error("unable to get nhosts from ctx");
+	}
+	/* nhost host states */
+	job->host_state =  xmalloc(job->nhosts * sizeof(srun_host_state_t));
+	
+	if (slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_CPUS, 
+			       &job->cpus) != SLURM_SUCCESS) {
+		error("unable to get hosts from ctx");
+	}
+	
+	if (slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, 
+			       &job->stepid) != SLURM_SUCCESS) {
+		error("unable to get step id from ctx");
+	}
+	if (slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS, 
+			       &job->ntask) != SLURM_SUCCESS) {
+		error("unable to get step id from ctx");
+	}
+	job->tids   = xmalloc(job->nhosts * sizeof(uint32_t *));
+	job->host   = xmalloc(job->nhosts * sizeof(char *));
+	for(i=0;i<job->nhosts;i++) {
+		if (slurm_step_ctx_get(job->step_ctx, 
+				       SLURM_STEP_CTX_TID, i,
+				       &job->tids[i]) != SLURM_SUCCESS) {
+			error("unable to get task id %d from ctx",i);
+		}
+		if (slurm_step_ctx_get(job->step_ctx, 
+				       SLURM_STEP_CTX_HOST, i,
+				       &temp) != SLURM_SUCCESS) {
+			error("unable to get host %d from ctx", i);
+		} else 
+			job->host[i] = xstrdup(temp);		
+	}
+	if (slurm_step_ctx_get(job->step_ctx, 
+			       SLURM_STEP_CTX_CRED,
+			       &job->cred) != SLURM_SUCCESS) {
+		error("unable to get cred from ctx");
+	}
+	if (slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_SWITCH_JOB,
+			       &job->switch_job) != SLURM_SUCCESS) {
+		error("unable to get switch_job from ctx");
+	}
+	slurm_free_job_step_create_request_msg(r);
+	job_update_io_fnames(job);
+	return 0;
+}
 
 void
 update_job_state(srun_job_t *job, srun_job_state_t state)
@@ -414,99 +621,37 @@ _set_nprocs(allocation_info_t *info)
 	}
 }
 
-static srun_job_t *
-_job_create_internal(allocation_info_t *info)
+void
+job_update_io_fnames(srun_job_t *job)
 {
-	int i;
-	int cpu_cnt = 0;
-	int cpu_inx = 0;
-	hostlist_t hl;
-	srun_job_t *job;
-	eio_obj_t *obj;
-
-	/* Reset nprocs if necessary 
-	 */
-	_set_nprocs(info);
-
-	debug2("creating job with %d tasks", opt.nprocs);
-
-	job = xmalloc(sizeof(*job));
-
-	slurm_mutex_init(&job->state_mutex);
-	pthread_cond_init(&job->state_cond, NULL);
-	job->state = SRUN_JOB_INIT;
+	job->ifname = fname_create(job, opt.ifname);
+	job->ofname = fname_create(job, opt.ofname);
+	job->efname = opt.efname ? fname_create(job, opt.efname) : job->ofname;
+}
 
-	job->signaled = false;
-	job->rc       = -1;
+static void
+_job_fake_cred(srun_job_t *job)
+{
+	slurm_cred_arg_t arg;
+	arg.jobid    = job->jobid;
+	arg.stepid   = job->stepid;
+	arg.uid      = opt.uid;
+	arg.hostlist = job->nodelist;
+        arg.ntask_cnt = 0;    
+        arg.ntask    =  NULL; 
+	job->cred = slurm_cred_faker(&arg);
+}
 
-	job->nodelist = xstrdup(info->nodelist);
+static void
+_job_noalloc_step_create(srun_job_t *job, allocation_info_t *info)
+{
+	int i=0, cpu_inx=0, cpu_cnt=0;
+	hostlist_t hl;
 	hl = hostlist_create(job->nodelist);
-#ifdef HAVE_FRONT_END	/* Limited job step support */
-	/* All jobs execute through front-end on Blue Gene/L.
-	 * Normally we would not permit execution of job steps, 
-	 * but can fake it by just allocating all tasks to 
-	 * one of the allocated nodes. */
-	job->nhosts    = 1;
-	opt.overcommit = true;
-#else
-	job->nhosts = hostlist_count(hl);
-#endif
-
- 	job->select_jobinfo = info->select_jobinfo;
-	job->jobid   = info->jobid;
-	job->stepid  = info->stepid;
-	job->old_job = false;
-	job->removed = false;
-
-	/* 
-	 *  Initialize Launch and Exit timeout values
-	 */
-	job->ltimeout = 0;
-	job->etimeout = 0;
-
-	job->slurmd_addr = xmalloc(job->nhosts * sizeof(slurm_addr));
-	if (info->addrs)
-		memcpy( job->slurmd_addr, info->addrs, 
-			sizeof(slurm_addr)*job->nhosts);
 
 	job->host  = (char **) xmalloc(job->nhosts * sizeof(char *));
 	job->cpus  = (int *)   xmalloc(job->nhosts * sizeof(int) );
 
-	/* Compute number of file descriptors / Ports needed for Job 
-	 * control info server
-	 */
-	job->njfds = _estimate_nports(opt.nprocs, 48);
-	job->jfd   = (slurm_fd *)   xmalloc(job->njfds * sizeof(slurm_fd));
-	job->jaddr = (slurm_addr *) xmalloc(job->njfds * sizeof(slurm_addr));
-
-	debug3("njfds = %d", job->njfds);
-
-	/* Compute number of listening sockets needed to allow
-	 * all of the slurmds to establish IO streams with srun, without
-	 * overstressing the TCP/IP backoff/retry algorithm
-	 */
-	job->num_listen = _estimate_nports(opt.nprocs, 64);
-	job->listensock = (int *) xmalloc(job->num_listen * sizeof(int));
-	job->listenport = (int *) xmalloc(job->num_listen * sizeof(int));
-
-	job->eio = eio_handle_create();
-	job->ioservers_ready = 0;
-	/* "nhosts" number of IO protocol sockets */
-	job->ioserver = (eio_obj_t **)xmalloc(job->nhosts*sizeof(eio_obj_t *));
-	job->free_incoming = list_create(NULL); /* FIXME! Needs destructor */
-	job->incoming_count = 0;
-	job->free_outgoing = list_create(NULL); /* FIXME! Needs destructor */
-	job->outgoing_count = 0;
-
-	/* nhost host states */
-	job->host_state =  xmalloc(job->nhosts * sizeof(srun_host_state_t));
-
-	/* ntask task states and statii*/
-	job->task_state  =  xmalloc(opt.nprocs * sizeof(srun_task_state_t));
-	job->tstatus	 =  xmalloc(opt.nprocs * sizeof(int));
-
-	slurm_mutex_init(&job->task_mutex);
-
 	for(i = 0; i < job->nhosts; i++) {
 		job->host[i]  = hostlist_shift(hl);
 
@@ -517,7 +662,9 @@ _job_create_internal(allocation_info_t *info)
 			cpu_cnt = 0;
 		}
 	}
-
+	/* nhost host states */
+	job->host_state =  xmalloc(job->nhosts * sizeof(srun_host_state_t));
+	job->hostid = xmalloc(opt.nprocs * sizeof(uint32_t));
 #ifdef HAVE_FRONT_END
 		job->ntask = (int *) xmalloc(sizeof(int *));
 		job->ntask[0] = opt.nprocs;
@@ -540,14 +687,14 @@ _job_create_internal(allocation_info_t *info)
 	for (i = 0; i < job->nhosts; i++)
 		job->tids[i] = xmalloc(job->ntask[i] * sizeof(uint32_t));
 
-	if (opt.distribution == SRUN_DIST_UNKNOWN) {
+	if (opt.distribution == SLURM_DIST_UNKNOWN) {
 		if (opt.nprocs <= job->nhosts)
-			opt.distribution = SRUN_DIST_CYCLIC;
+			opt.distribution = SLURM_DIST_CYCLIC;
 		else
-			opt.distribution = SRUN_DIST_BLOCK;
+			opt.distribution = SLURM_DIST_BLOCK;
 	}
 
-	if (opt.distribution == SRUN_DIST_BLOCK)
+	if (opt.distribution == SLURM_DIST_BLOCK)
 		_dist_block(job);
 	else
 		_dist_cyclic(job);
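When no explicit distribution is requested, the code above falls back to cyclic placement if there are at least as many nodes as tasks, and to block placement otherwise. The standalone sketch below contrasts the two layouts under the simplifying assumption that every node can accept the same number of tasks; the real _dist_block() and _dist_cyclic() also honor per-node CPU counts.

/* Illustrative only: contrast of the two default layouts chosen above,
 * assuming equal capacity on every node. */
#include <stdio.h>

int main(void)
{
	int nprocs = 7, nhosts = 3, i;
	int per_node = (nprocs + nhosts - 1) / nhosts;	/* ceiling */

	for (i = 0; i < nprocs; i++)	/* cyclic: round robin over nodes */
		printf("cyclic: task %d -> node %d\n", i, i % nhosts);

	for (i = 0; i < nprocs; i++)	/* block: fill node by node */
		printf("block:  task %d -> node %d\n", i, i / per_node);

	return 0;
}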
@@ -555,33 +702,9 @@ _job_create_internal(allocation_info_t *info)
 	job_update_io_fnames(job);
 
 	hostlist_destroy(hl);
-
-	return job;
-}
-
-void
-job_update_io_fnames(srun_job_t *job)
-{
-	job->ifname = fname_create(job, opt.ifname);
-	job->ofname = fname_create(job, opt.ofname);
-	job->efname = opt.efname ? fname_create(job, opt.efname) : job->ofname;
-}
-
-static void
-_job_fake_cred(srun_job_t *job)
-{
-	slurm_cred_arg_t arg;
-	arg.jobid    = job->jobid;
-	arg.stepid   = job->stepid;
-	arg.uid      = opt.uid;
-	arg.hostlist = job->nodelist;
-        arg.ntask_cnt = 0;    
-        arg.ntask    =  NULL; 
-	job->cred = slurm_cred_faker(&arg);
+	return;
 }
 
-
-
 static char *
 _task_state_name(srun_task_state_t state_inx)
 {
@@ -1021,3 +1144,4 @@ _normalize_hostlist(const char *hostlist)
 
 	return xstrdup(buf);
 }
+
diff --git a/src/srun/srun_job.h b/src/srun/srun_job.h
index 14af524a8ce..7c3ab8eb587 100644
--- a/src/srun/srun_job.h
+++ b/src/srun/srun_job.h
@@ -116,14 +116,10 @@ typedef struct srun_job {
 	pthread_t sigid;	/* signals thread tid		  */
 
 	pthread_t jtid;		/* job control thread id 	  */
-	int njfds;		/* number of job control info fds */
 	slurm_fd *jfd;		/* job control info fd   	  */
-	slurm_addr *jaddr;	/* job control info ports 	  */
-
+	
 	pthread_t ioid;		/* stdio thread id 		  */
-	int num_listen;		/* Number of stdio listen sockets */
 	int *listensock;	/* Array of stdio listen sockets  */
-	int *listenport;	/* Array of stdio listen ports 	  */
 	eio_handle_t *eio;      /* Event IO handle                */
 	int ioservers_ready;    /* Number of servers that established contact */
 	eio_obj_t **ioserver;	/* Array of nhosts pointers to eio_obj_t */
@@ -156,16 +152,24 @@ typedef struct srun_job {
 
 	int *tstatus;	          /* ntask exit statii */
 	srun_task_state_t *task_state; /* ntask task states */
-	pthread_mutex_t task_mutex;
-
+	
 	switch_jobinfo_t switch_job;
 	io_filename_t *ifname;
 	io_filename_t *ofname;
 	io_filename_t *efname;
+	forked_msg_t *forked_msg;
+	struct slurm_step_ctx_struct *step_ctx;
+	char *task_epilog;	/* task-epilog */
+	char *task_prolog;	/* task-prolog */
+	pthread_mutex_t task_mutex;
+	int njfds;		/* number of job control info fds */
+	slurm_addr *jaddr;	/* job control info ports 	  */
+	int num_listen;		/* Number of stdio listen sockets */
+	int *listenport;	/* Array of stdio listen ports 	  */
 
 	/* Output streams and stdin fileno */
-	forked_msg_t *forked_msg;
 	select_jobinfo_t select_jobinfo;
+	
 } srun_job_t;
 
 extern int message_thread;
@@ -175,8 +179,12 @@ void    job_force_termination(srun_job_t *job);
 
 srun_job_state_t job_state(srun_job_t *job);
 
-srun_job_t * job_create_noalloc(void);
-srun_job_t * job_create_allocation(resource_allocation_response_msg_t *resp);
+extern srun_job_t * job_create_noalloc(void);
+extern srun_job_t * job_create_allocation(
+	resource_allocation_response_msg_t *resp);
+extern srun_job_t * job_create_structure(
+	resource_allocation_response_msg_t *resp);
+extern int build_step_ctx(srun_job_t *job);
 
 /*
  *  Update job filenames and modes for stderr, stdout, and stdin.
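With job creation now split across job_create_structure() and build_step_ctx(), a caller would presumably build the job structure first and then the step context. The sketch below is one plausible ordering based only on the declarations above; the helper name setup_job and the return-value conventions (NULL and negative on failure) are assumptions, not part of this change.

/* Plausible usage of the new entry points; setup_job and the failure
 * conventions shown here are assumptions, not code from the patch. */
#include "src/srun/srun_job.h"
#include "src/common/log.h"

static srun_job_t *setup_job(resource_allocation_response_msg_t *resp)
{
	srun_job_t *job = job_create_structure(resp);	/* fill in job fields */

	if (job == NULL)
		fatal("unable to build srun job structure");
	if (build_step_ctx(job) < 0)	/* assumed failure convention */
		error("unable to build step context for job %u", job->jobid);
	return job;
}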
diff --git a/testsuite/expect/test1.47 b/testsuite/expect/test1.47
index 69933d6787d..36927c0bafb 100755
--- a/testsuite/expect/test1.47
+++ b/testsuite/expect/test1.47
@@ -79,7 +79,7 @@ if {$matches != 1} {
 	set exit_code 1
 }
 
-exec $bin_rm -f $file_in $file_out
+exec $bin_rm -f $file_in
 exec echo "#!$bin_bash"              >$file_in
 exec echo "#SLURM -N650000"         >>$file_in
 exec echo "$bin_sleep $delay"       >>$file_in
@@ -108,8 +108,8 @@ expect {
 # Post-processing
 #
 if {$exit_code == 0} {
-	wait_for_file $file_out
-	exec $bin_rm -f $file_in $file_out
+	exec $bin_rm -f $file_in
+	exec $bin_rm -f $file_out
 	send_user "\nSUCCESS\n"
 }
 exit $exit_code
diff --git a/testsuite/expect/test1.51 b/testsuite/expect/test1.51
index fbe6e45c6e2..13dd4109697 100755
--- a/testsuite/expect/test1.51
+++ b/testsuite/expect/test1.51
@@ -63,7 +63,7 @@ exec $bin_chmod 700 $file_in $file_script
 set matches 0
 spawn $file_script
 expect {
-	-re 0123 {
+	-re (0123|123) {
 		set matches 1
 		exp_continue
 	}
diff --git a/testsuite/expect/test1.81 b/testsuite/expect/test1.81
index 2da3e8a990e..93c91b098f6 100755
--- a/testsuite/expect/test1.81
+++ b/testsuite/expect/test1.81
@@ -45,7 +45,28 @@ set host_0      ""
 set task_cnt    0
 set can_not_run 0
 set timeout $max_job_delay
-spawn $srun -N1-1 -c 1 -l -t1 $bin_hostname -s
+
+#
+# Determine if this is AIX (to build the appropriate hostname command line)
+#
+set aix 0
+spawn $bin_uname
+expect {
+        -re "AIX" {
+                set aix 1
+                exp_continue
+        }
+        eof {
+                wait
+        }
+}
+
+if {$aix == 0} {
+	set run_hostname "$bin_hostname -s"
+} else {
+	set run_hostname "$bin_hostname"	
+}
+spawn $srun -N1-1 -c 1 -l -t1 $run_hostname
 expect {
 	-re "Unable to create job step: Task count specification invalid" {
 		send_user "\nWARNING: This is not a real error for some system configurations\n"
@@ -76,7 +97,7 @@ if {[string compare $host_0 ""] == 0} {
 
 set alloc_fail 0
 set task_cnt2  0
-spawn $srun -N1-1 -w $host_0 -n [expr $task_cnt + 1] -l -t1 $bin_hostname -s
+spawn $srun -N1-1 -w $host_0 -n [expr $task_cnt + 1] -l -t1 $run_hostname 
 expect {
 	-re "($number):" {
 		incr task_cnt2
@@ -105,7 +126,7 @@ if { $task_cnt2 != 0 } {
 #
 set host_0      ""
 set host_1      ""
-spawn $srun -N1-1 -l -t1 $bin_hostname -s  
+spawn $srun -N1-1 -l -t1 $run_hostname  
 expect {
 	-re "($number): ($alpha_numeric)" {
 		if {$expect_out(1,string) == 0} {
@@ -150,7 +171,7 @@ set host_1      ""
 set host_2      ""
 set host_3      ""
 set timeout $max_job_delay
-spawn $srun -N1-3 -l -t1 $bin_hostname -s  
+spawn $srun -N1-3 -l -t1 $run_hostname 
 expect {
 	-re "($number): ($alpha_numeric)" {
 		if {$expect_out(1,string) == 0} {
@@ -212,7 +233,7 @@ set host_1      ""
 set host_2      ""
 set host_3      ""
 set timeout $max_job_delay
-spawn $srun -N2-3 -l -t1 $bin_hostname -s  
+spawn $srun -N2-3 -l -t1 $run_hostname   
 expect {
 	-re "More ($alpha) requested than permitted" {
 		send_user "\nWARNING: can't test srun task distribution\n"
diff --git a/testsuite/expect/test9.8 b/testsuite/expect/test9.8
index 8e45621b805..11469408497 100755
--- a/testsuite/expect/test9.8
+++ b/testsuite/expect/test9.8
@@ -113,6 +113,7 @@ if {$start_cnt < $job_cnt} {
 # then kill them all
 #
 set user_name ""
+
 exec $bin_sleep [expr $delay + 5]
 spawn $bin_id -un
 expect {
@@ -122,6 +123,7 @@ expect {
 	eof {
 		wait
 	}
+	
 }
 #
 # There could be hundreds of job steps, we don't want to see
@@ -145,12 +147,16 @@ expect {
 }
 log_user 0
 set matches 0
+set timeout 60
 spawn $squeue --steps --user $user_name
 expect {
 	-re "sleep" {
 		incr matches
 		exp_continue
 	}
+	timeout {
+		send_user "\nFAILURE: squeue not responding\n"; set exit_code 1
+	}
 	eof {
 		wait
 	}
-- 
GitLab