From d1f9c73edbd05670e5f0a37cf41b445abc31e04c Mon Sep 17 00:00:00 2001 From: "Christopher J. Morrone" <morrone2@llnl.gov> Date: Thu, 20 Jul 2006 22:45:15 +0000 Subject: [PATCH] Update slurm_launch_tasks to work with new location of task layout structure. poe may have problems with the spawn_task changes. --- src/api/spawn.c | 34 +++++++++++++++++----------------- src/api/step_ctx.c | 25 ++++++++++--------------- src/api/step_ctx.h | 3 --- src/api/step_launch.c | 12 ++++++------ src/slaunch/slaunch.c | 2 +- 5 files changed, 34 insertions(+), 42 deletions(-) diff --git a/src/api/spawn.c b/src/api/spawn.c index a8ea540766f..5a343fa8acd 100644 --- a/src/api/spawn.c +++ b/src/api/spawn.c @@ -118,8 +118,8 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array) } /* validate fd_array and bind them to ports */ - sock_array = xmalloc(ctx->step_layout->num_hosts * sizeof(int)); - for (i=0; i<ctx->step_layout->num_hosts; i++) { + sock_array = xmalloc(ctx->step_resp->step_layout->num_hosts * sizeof(int)); + for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) { if (fd_array[i] < 0) { slurm_seterrno(EINVAL); free(sock_array); @@ -135,14 +135,14 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array) } msg_array_ptr = xmalloc(sizeof(spawn_task_request_msg_t) * - ctx->step_layout->num_hosts); + ctx->step_resp->step_layout->num_hosts); req_array_ptr = xmalloc(sizeof(slurm_msg_t) * - ctx->step_layout->num_hosts); + ctx->step_resp->step_layout->num_hosts); hostlist = hostlist_create(ctx->alloc_resp->node_list); itr = hostlist_iterator_create(hostlist); - for (i=0; i<ctx->step_layout->num_hosts; i++) { + for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) { spawn_task_request_msg_t *r = &msg_array_ptr[i]; slurm_msg_t *m = &req_array_ptr[i]; @@ -156,13 +156,13 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array) r->envc = ctx->envc; r->env = ctx->env; r->cwd = ctx->cwd; - r->nnodes = ctx->step_layout->num_hosts; - r->nprocs = ctx->step_layout->num_tasks; 
+ r->nnodes = ctx->step_resp->step_layout->num_hosts; + r->nprocs = ctx->step_resp->step_layout->num_tasks; r->switch_job = ctx->step_resp->switch_job; r->slurmd_debug = slurmd_debug; /* Task specific message contents */ - r->global_task_id = ctx->step_layout->tids[i][0]; - r->cpus_allocated = ctx->step_layout->cpus[i]; + r->global_task_id = ctx->step_resp->step_layout->tids[i][0]; + r->cpus_allocated = ctx->step_resp->step_layout->cpus[i]; r->srun_node_id = (uint32_t) i; r->io_port = ntohs(sock_array[i]); m->msg_type = REQUEST_SPAWN_TASK; @@ -170,7 +170,7 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array) j=0; while((host = hostlist_next(itr))) { - if(!strcmp(host,ctx->step_layout->host[i])) { + if(!strcmp(host,ctx->step_resp->step_layout->host[i])) { free(host); break; } @@ -178,14 +178,14 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array) free(host); } debug2("using %d %s with %d tasks\n", j, - ctx->step_layout->host[i], + ctx->step_resp->step_layout->host[i], r->nprocs); hostlist_iterator_reset(itr); memcpy(&m->address, &ctx->alloc_resp->node_addr[j], sizeof(slurm_addr)); #if _DEBUG printf("tid=%d, fd=%d, port=%u, node_id=%u\n", - ctx->step_layout->tids[i][0], + ctx->step_resp->step_layout->tids[i][0], fd_array[i], r->io_port, i); #endif } @@ -317,7 +317,7 @@ static void _dump_ctx(slurm_step_ctx ctx) } } - for (i=0; i<ctx->step_layout->num_hosts; i++) { + for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) { printf("host=%s cpus=%u tasks=%u", ctx->host[i], ctx->cpus[i], ctx->tasks[i]); for (j=0; j<ctx->tasks[i]; j++) @@ -337,19 +337,19 @@ static int _p_launch(slurm_msg_t *req, slurm_step_ctx ctx) int rc = SLURM_SUCCESS, i; thd_t *thd; - thd = xmalloc(sizeof(thd_t) * ctx->step_layout->num_hosts); + thd = xmalloc(sizeof(thd_t) * ctx->step_resp->step_layout->num_hosts); if (thd == NULL) { slurm_seterrno(ENOMEM); return SLURM_ERROR; } - for (i=0; i<ctx->step_layout->num_hosts; i++) { + for (i=0; i<ctx->step_resp->step_layout->num_hosts; 
i++) { thd[i].state = DSH_NEW; thd[i].req = &req[i]; } /* start all the other threads (up to _MAX_THREAD_COUNT active) */ - for (i=0; i<ctx->step_layout->num_hosts; i++) { + for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) { /* wait until "room" for another thread */ slurm_mutex_lock(&thread_mutex); while (threads_active >= _MAX_THREAD_COUNT) { @@ -378,7 +378,7 @@ static int _p_launch(slurm_msg_t *req, slurm_step_ctx ctx) /* wait for all tasks to terminate */ slurm_mutex_lock(&thread_mutex); - for (i=0; i<ctx->step_layout->num_hosts; i++) { + for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) { while (thd[i].state < DSH_DONE) { /* wait until another thread completes*/ pthread_cond_wait(&thread_cond, &thread_mutex); diff --git a/src/api/step_ctx.c b/src/api/step_ctx.c index a177dcaf435..f1fbe740a19 100644 --- a/src/api/step_ctx.c +++ b/src/api/step_ctx.c @@ -83,19 +83,12 @@ slurm_step_ctx_create (job_step_create_request_msg_t *step_req) ctx = xmalloc(sizeof(struct slurm_step_ctx_struct)); ctx->launch_state = NULL; - ctx->step_layout = step_layout_create(alloc_resp, step_resp, step_req); - ctx->magic = STEP_CTX_MAGIC; ctx->job_id = step_req->job_id; ctx->user_id = step_req->user_id; ctx->step_req = _copy_step_req(step_req); ctx->step_resp = step_resp; ctx->alloc_resp = alloc_resp; - if (task_layout(ctx->step_layout) != SLURM_SUCCESS) { - slurm_step_ctx_destroy((slurm_step_ctx)ctx); - errno = ESLURM_BAD_DIST; - return NULL; - } return (slurm_step_ctx)ctx; } @@ -132,18 +125,20 @@ slurm_step_ctx_get (slurm_step_ctx ctx, int ctx_key, ...) 
break; case SLURM_STEP_CTX_TASKS: uint32_array_pptr = (uint32_t **) va_arg(ap, void *); - *uint32_array_pptr = ctx->step_layout->tasks; + *uint32_array_pptr = ctx->step_resp->step_layout->tasks; break; case SLURM_STEP_CTX_TID: node_inx = va_arg(ap, uint32_t); - if ((node_inx < 0) || (node_inx > ctx->step_layout->num_hosts)) { + if ((node_inx < 0) + || (node_inx > ctx->step_resp->step_layout->num_hosts)) { slurm_seterrno(EINVAL); rc = SLURM_ERROR; break; } uint32_array_pptr = (uint32_t **) va_arg(ap, void *); - *uint32_array_pptr = ctx->step_layout->tids[node_inx]; + *uint32_array_pptr = + ctx->step_resp->step_layout->tids[node_inx]; break; case SLURM_STEP_CTX_RESP: @@ -161,21 +156,22 @@ slurm_step_ctx_get (slurm_step_ctx ctx, int ctx_key, ...) break; case SLURM_STEP_CTX_NUM_HOSTS: uint32_ptr = (uint32_t *) va_arg(ap, void *); - *uint32_ptr = ctx->step_layout->num_hosts; + *uint32_ptr = ctx->step_resp->step_layout->num_hosts; break; case SLURM_STEP_CTX_CPUS: uint32_array_pptr = (uint32_t **) va_arg(ap, void *); - *uint32_array_pptr = ctx->step_layout->cpus; + *uint32_array_pptr = ctx->step_resp->step_layout->cpus; break; case SLURM_STEP_CTX_HOST: node_inx = va_arg(ap, uint32_t); - if ((node_inx < 0) || (node_inx > ctx->step_layout->num_hosts)) { + if ((node_inx < 0) + || (node_inx > ctx->step_resp->step_layout->num_hosts)) { slurm_seterrno(EINVAL); rc = SLURM_ERROR; break; } char_array_pptr = (char **) va_arg(ap, void *); - *char_array_pptr = ctx->step_layout->host[node_inx]; + *char_array_pptr = ctx->step_resp->step_layout->host[node_inx]; break; default: slurm_seterrno(EINVAL); @@ -276,7 +272,6 @@ slurm_step_ctx_destroy (slurm_step_ctx ctx) slurm_seterrno(EINVAL); return SLURM_ERROR; } - step_layout_destroy(ctx->step_layout); _free_step_req(ctx->step_req); slurm_free_job_step_create_response_msg(ctx->step_resp); slurm_free_resource_allocation_response_msg(ctx->alloc_resp); diff --git a/src/api/step_ctx.h b/src/api/step_ctx.h index d4700a77565..ee733b46edc 100644 
--- a/src/api/step_ctx.h +++ b/src/api/step_ctx.h @@ -83,9 +83,6 @@ struct slurm_step_ctx_struct { uint32_t envc; /* count of env vars */ char **env; /* environment variables */ - slurm_step_layout_t *step_layout; /* holds info about how the task is - laid out */ - /* Used by slurm_step_launch(), but not slurm_spawn() */ struct step_launch_state *launch_state; }; diff --git a/src/api/step_launch.c b/src/api/step_launch.c index 19ba1fe0e5d..5b6895a3a85 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -198,9 +198,9 @@ int slurm_step_launch (slurm_step_ctx ctx, /* job->step_layout->tasks[i] = 1; */ /* } */ - launch.tasks_to_launch = ctx->step_layout->tasks; - launch.cpus_allocated = ctx->step_layout->cpus; - launch.global_task_ids = ctx->step_layout->tids; + launch.tasks_to_launch = ctx->step_resp->step_layout->tasks; + launch.cpus_allocated = ctx->step_resp->step_layout->cpus; + launch.global_task_ids = ctx->step_resp->step_layout->tids; ctx->launch_state->client_io = _setup_step_client_io( ctx, params->local_fds, params->labelio); @@ -559,13 +559,13 @@ static int _launch_tasks(slurm_step_ctx ctx, msg.ret_list = NULL; msg.orig_addr.sin_addr.s_addr = 0; msg.buffer = buffer; - memcpy(&msg.address, &ctx->alloc_resp->node_addr[0], + memcpy(&msg.address, &ctx->step_resp->step_layout->node_addr[0], sizeof(slurm_addr)); timeout = slurm_get_msg_timeout(); forward_set_launch(&msg.forward, - ctx->step_req->node_count, + ctx->step_resp->step_layout->num_hosts, &zero, - ctx->step_layout, + ctx->step_resp->step_layout, itr, timeout); hostlist_iterator_destroy(itr); diff --git a/src/slaunch/slaunch.c b/src/slaunch/slaunch.c index bf2556f8d05..d5575f0c8ab 100644 --- a/src/slaunch/slaunch.c +++ b/src/slaunch/slaunch.c @@ -191,7 +191,7 @@ int slaunch(int argc, char **argv) /* FIXME - don't peek into the step context, that's cheating! 
*/ _setup_local_fds(&params.local_fds, (int)step_ctx->job_id, (int)step_ctx->step_resp->job_step_id, - step_ctx->step_layout); + step_ctx->step_resp->step_layout); params.parallel_debug = opt.parallel_debug ? true : false; params.task_start_callback = _task_start; params.task_finish_callback = _task_finish; -- GitLab