diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 6f835a78d22f6688a0670806d3a998bb18c380ae..318ad3404a236ab39fc13c0b4f15d098726cd0c6 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -316,7 +316,6 @@ enum ctx_keys {
 	SLURM_STEP_CTX_CRED,
 	SLURM_STEP_CTX_SWITCH_JOB,
 	SLURM_STEP_CTX_NUM_HOSTS,
-	SLURM_STEP_CTX_CPUS,
 	SLURM_STEP_CTX_HOST
 };
 
@@ -477,27 +476,6 @@ typedef struct job_info_msg {
 	job_info_t *job_array;	/* the job records */
 } job_info_msg_t;
 
-typedef struct slurm_step_layout {
-	char *nodes;		/* list of nodes in step */
-	char **host;		/* list separated for convience sake
-				   created from nodes */
-	slurm_addr *node_addr;	/* corisponding addresses */
-	uint16_t num_hosts;	/* node count */
-	uint32_t num_tasks;	/* number of tasks to execute */
-
-	uint16_t num_cpu_groups;/* count of cpu reps */
-	uint32_t *cpus_per_node; /* consolidated list of cpus per node */
-	uint32_t *cpu_count_reps; /* how many host each cpus per
-				     node refers to */
-
-	uint32_t *tasks;	/* number of tasks on each host */
-
-	uint32_t **tids;	/* host id => task id mapping */
-	uint32_t *hostids;	/* task id => host id mapping */
-	uint32_t *cpus;		/* total count of processors on each host */
-
-} slurm_step_layout_t;
-
 typedef struct job_step_specs {
 	uint32_t job_id;	/* job ID */
 	uint32_t user_id;	/* user the job runs as */
@@ -515,9 +493,13 @@ typedef struct job_step_specs {
 
 typedef struct job_step_create_response_msg {
 	uint32_t job_step_id;	/* assigned job step id */
-	char *node_list;	/* list of allocated nodes */
-	slurm_step_layout_t *step_layout; /* info about how the tasks
-					     are to be layout in the step */
+	uint16_t node_cnt;	/* number of nodes in step */
+	char *node_list;	/* list of nodes in step */
+	slurm_addr *node_addr;	/* corresponding addresses */
+	uint32_t *tasks;	/* number of tasks on each host */
+
+	uint32_t **tids;	/* host id => task id mapping */
+
 	slurm_cred_t cred;	/* slurm job credential */
 	switch_jobinfo_t switch_job;	/* switch context, opaque
 					 * data structure */
 } job_step_create_response_msg_t;
@@ -642,7 +624,6 @@ typedef struct resource_allocation_response_msg {
 	uint32_t *cpus_per_node;/* cpus per node */
 	uint32_t *cpu_count_reps;/* how many nodes have same cpu count */
 	uint16_t node_cnt;	/* count of nodes */
-	slurm_addr *node_addr;	/* network addresses */
 	uint32_t error_code;	/* error code for warning message */
 	select_jobinfo_t select_jobinfo; /* opaque data structure,
 					  * use select_g_get_jobinfo() to access conents */
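[Reviewer note, not part of the patch] With slurm_step_layout_t gone from the public header, API clients now read node_cnt, tasks[] and tids[][] straight off the step-create response. A minimal sketch (helper name is ours) of totaling a step's tasks from the reshaped struct, assuming a filled-in response as declared above:

	/* sketch only: sum the per-node task counts of a step response */
	static uint32_t step_task_total(job_step_create_response_msg_t *resp)
	{
		uint32_t total = 0;
		uint16_t i;

		for (i = 0; i < resp->node_cnt; i++)
			total += resp->tasks[i];	/* tasks on node i */
		return total;
	}

This is the same accumulation slurm_spawn() performs below when it computes task_cnt.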
diff --git a/src/api/spawn.c b/src/api/spawn.c
index 5a343fa8acdef0f8fc6c3295dd9401b09c9338f5..89ac8c7cff0fe4459bb5d8041f3b921a309f4b32 100644
--- a/src/api/spawn.c
+++ b/src/api/spawn.c
@@ -92,12 +92,13 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array)
 	spawn_task_request_msg_t *msg_array_ptr;
 	int *sock_array;
 	slurm_msg_t *req_array_ptr;
-	int i, j, rc = SLURM_SUCCESS;
+	int i, rc = SLURM_SUCCESS;
 	uint16_t slurmd_debug = 0;
 	char *env_var;
 	hostlist_t hostlist = NULL;
 	hostlist_iterator_t itr = NULL;
-	char *host = NULL;
+	int task_cnt = 0;
+	uint32_t *cpus = NULL;
 
 	if ((ctx == NULL) ||
 	    (ctx->magic != STEP_CTX_MAGIC) ||
@@ -118,8 +119,8 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array)
 	}
 
 	/* validate fd_array and bind them to ports */
-	sock_array = xmalloc(ctx->step_resp->step_layout->num_hosts * sizeof(int));
-	for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) {
+	sock_array = xmalloc(ctx->step_resp->node_cnt * sizeof(int));
+	for (i=0; i<ctx->step_resp->node_cnt; i++) {
 		if (fd_array[i] < 0) {
 			slurm_seterrno(EINVAL);
 			free(sock_array);
@@ -132,17 +133,19 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array)
 			return SLURM_ERROR;
 		}
 		listen(fd_array[i], 5);
+		task_cnt += ctx->step_resp->tasks[i];
 	}
+	cpus = ctx->step_resp->tasks;
 
 	msg_array_ptr = xmalloc(sizeof(spawn_task_request_msg_t) *
-			ctx->step_resp->step_layout->num_hosts);
+			ctx->step_resp->node_cnt);
 	req_array_ptr = xmalloc(sizeof(slurm_msg_t) *
-			ctx->step_resp->step_layout->num_hosts);
+			ctx->step_resp->node_cnt);
 	hostlist = hostlist_create(ctx->alloc_resp->node_list);
 	itr = hostlist_iterator_create(hostlist);
-	for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) {
+	for (i=0; i<ctx->step_resp->node_cnt; i++) {
 		spawn_task_request_msg_t *r = &msg_array_ptr[i];
 		slurm_msg_t *m = &req_array_ptr[i];
 
@@ -156,36 +159,23 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array)
 		r->envc = ctx->envc;
 		r->env = ctx->env;
 		r->cwd = ctx->cwd;
-		r->nnodes = ctx->step_resp->step_layout->num_hosts;
-		r->nprocs = ctx->step_resp->step_layout->num_tasks;
+		r->nnodes = ctx->step_resp->node_cnt;
+		r->nprocs = task_cnt;
 		r->switch_job = ctx->step_resp->switch_job;
 		r->slurmd_debug = slurmd_debug;
 
 		/* Task specific message contents */
-		r->global_task_id = ctx->step_resp->step_layout->tids[i][0];
-		r->cpus_allocated = ctx->step_resp->step_layout->cpus[i];
+		r->global_task_id = ctx->step_resp->tids[i][0];
+		r->cpus_allocated = cpus[i];
 		r->srun_node_id = (uint32_t) i;
 		r->io_port = ntohs(sock_array[i]);
 		m->msg_type = REQUEST_SPAWN_TASK;
 		m->data = r;
-
-		j=0;
-		while((host = hostlist_next(itr))) {
-			if(!strcmp(host,ctx->step_resp->step_layout->host[i])) {
-				free(host);
-				break;
-			}
-			j++;
-			free(host);
-		}
-		debug2("using %d %s with %d tasks\n", j,
-		       ctx->step_resp->step_layout->host[i],
-		       r->nprocs);
-		hostlist_iterator_reset(itr);
-		memcpy(&m->address, &ctx->alloc_resp->node_addr[j],
+
+		memcpy(&m->address, &ctx->step_resp->node_addr[i],
 		       sizeof(slurm_addr));
 #if _DEBUG
 		printf("tid=%d, fd=%d, port=%u, node_id=%u\n",
-		       ctx->step_resp->step_layout->tids[i][0],
+		       ctx->step_resp->tids[i][0],
 		       fd_array[i], r->io_port, i);
 #endif
 	}
@@ -317,7 +307,7 @@ static void _dump_ctx(slurm_step_ctx ctx)
 		}
 	}
 
-	for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) {
+	for (i=0; i<ctx->step_resp->node_cnt; i++) {
 		printf("host=%s cpus=%u tasks=%u",
 		       ctx->host[i], ctx->cpus[i], ctx->tasks[i]);
 		for (j=0; j<ctx->tasks[i]; j++)
@@ -337,19 +327,19 @@ static int _p_launch(slurm_msg_t *req, slurm_step_ctx ctx)
 	int rc = SLURM_SUCCESS, i;
 	thd_t *thd;
 
-	thd = xmalloc(sizeof(thd_t) * ctx->step_resp->step_layout->num_hosts);
+	thd = xmalloc(sizeof(thd_t) * ctx->step_resp->node_cnt);
 	if (thd == NULL) {
 		slurm_seterrno(ENOMEM);
 		return SLURM_ERROR;
 	}
 
-	for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) {
+	for (i=0; i<ctx->step_resp->node_cnt; i++) {
 		thd[i].state = DSH_NEW;
 		thd[i].req = &req[i];
 	}
 
 	/* start all the other threads (up to _MAX_THREAD_COUNT active) */
-	for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) {
+	for (i=0; i<ctx->step_resp->node_cnt; i++) {
 		/* wait until "room" for another thread */
 		slurm_mutex_lock(&thread_mutex);
 		while (threads_active >= _MAX_THREAD_COUNT) {
@@ -378,7 +368,7 @@ static int _p_launch(slurm_msg_t *req, slurm_step_ctx ctx)
 
 	/* wait for all tasks to terminate */
 	slurm_mutex_lock(&thread_mutex);
-	for (i=0; i<ctx->step_resp->step_layout->num_hosts; i++) {
+	for (i=0; i<ctx->step_resp->node_cnt; i++) {
 		while (thd[i].state < DSH_DONE) {
 			/* wait until another thread completes*/
 			pthread_cond_wait(&thread_cond, &thread_mutex);
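[Reviewer note] slurm_spawn() above now sizes everything by step_resp->node_cnt and expects fd_array[] to hold one pre-bound socket per node; the function itself calls listen() on each entry. A hedged sketch, using plain BSD sockets, of preparing one such fd. Binding to port 0 assumes the caller (or the library, via getsockname(), which this hunk does not show) recovers the kernel-chosen port afterwards:

	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <netinet/in.h>

	/* sketch only: build a bound-but-not-yet-listening TCP socket
	 * suitable for one fd_array[] slot */
	static int make_spawn_fd(void)
	{
		struct sockaddr_in sin;
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0)
			return -1;
		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_addr.s_addr = htonl(INADDR_ANY);
		sin.sin_port = 0;	/* kernel picks an ephemeral port */
		if (bind(fd, (struct sockaddr *) &sin, sizeof(sin)) < 0) {
			close(fd);
			return -1;
		}
		return fd;	/* slurm_spawn() itself calls listen() */
	}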
diff --git a/src/api/step_ctx.c b/src/api/step_ctx.c
index 8ca2eb8fa303d36803dbc673996f4d5f557e50a5..1b4ab4b429b66cf38c024baed2a914c74842b621 100644
--- a/src/api/step_ctx.c
+++ b/src/api/step_ctx.c
@@ -125,20 +125,20 @@ slurm_step_ctx_get (slurm_step_ctx ctx, int ctx_key, ...)
 		break;
 	case SLURM_STEP_CTX_TASKS:
 		uint32_array_pptr = (uint32_t **) va_arg(ap, void *);
-		*uint32_array_pptr = ctx->step_resp->step_layout->tasks;
+		*uint32_array_pptr = ctx->step_resp->tasks;
 		break;
 
 	case SLURM_STEP_CTX_TID:
 		node_inx = va_arg(ap, uint32_t);
 		if ((node_inx < 0)
-		    || (node_inx > ctx->step_resp->step_layout->num_hosts)) {
+		    || (node_inx > ctx->step_resp->node_cnt)) {
 			slurm_seterrno(EINVAL);
 			rc = SLURM_ERROR;
 			break;
 		}
 		uint32_array_pptr = (uint32_t **) va_arg(ap, void *);
 		*uint32_array_pptr =
-			ctx->step_resp->step_layout->tids[node_inx];
+			ctx->step_resp->tids[node_inx];
 		break;
 
 	case SLURM_STEP_CTX_RESP:
@@ -156,22 +156,19 @@ slurm_step_ctx_get (slurm_step_ctx ctx, int ctx_key, ...)
 		break;
 	case SLURM_STEP_CTX_NUM_HOSTS:
 		uint32_ptr = (uint32_t *) va_arg(ap, void *);
-		*uint32_ptr = ctx->step_resp->step_layout->num_hosts;
-		break;
-	case SLURM_STEP_CTX_CPUS:
-		uint32_array_pptr = (uint32_t **) va_arg(ap, void *);
-		*uint32_array_pptr = ctx->step_resp->step_layout->cpus;
+		*uint32_ptr = ctx->step_resp->node_cnt;
 		break;
 	case SLURM_STEP_CTX_HOST:
 		node_inx = va_arg(ap, uint32_t);
 		if ((node_inx < 0)
-		    || (node_inx > ctx->step_resp->step_layout->num_hosts)) {
+		    || (node_inx > ctx->step_resp->node_cnt)) {
 			slurm_seterrno(EINVAL);
 			rc = SLURM_ERROR;
 			break;
 		}
 		char_array_pptr = (char **) va_arg(ap, void *);
-		*char_array_pptr = ctx->step_resp->step_layout->host[node_inx];
+		*char_array_pptr = nodelist_nth_host(ctx->step_resp->node_list,
+						     node_inx);
 		break;
 	default:
 		slurm_seterrno(EINVAL);
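[Reviewer note] SLURM_STEP_CTX_HOST now hands back the result of nodelist_nth_host(), which (per the dist_tasks.c hunks below) is a fresh string from hostlist_nth() rather than a borrowed pointer into the old step_layout->host[] array. Callers therefore own the string and must free() it. A hedged usage sketch, following the va_arg order in the switch above:

	/* sketch only: fetch and release the name of node 0 in a step */
	char *host = NULL;

	if (slurm_step_ctx_get(ctx, SLURM_STEP_CTX_HOST,
			       (uint32_t) 0, &host) == SLURM_SUCCESS) {
		printf("first node in step: %s\n", host);
		free(host);	/* hostlist_nth() copy; was borrowed
				 * before this patch */
	}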
diff --git a/src/api/step_launch.c b/src/api/step_launch.c
index 5c4d86251bfd3455a2a12001f326a91d68f30e60..3e2afba4b395a0107a6549d4f21a0bc32816e454 100644
--- a/src/api/step_launch.c
+++ b/src/api/step_launch.c
@@ -194,19 +194,20 @@ int slurm_step_launch (slurm_step_ctx ctx,
 
 	/* Node specific message contents */
 /* 	if (slurm_mpi_single_task_per_node ()) { */
-/* 		for (i = 0; i < job->step_layout->num_hosts; i++) */
-/* 			job->step_layout->tasks[i] = 1; */
+/* 		for (i = 0; i < job->num_hosts; i++) */
+/* 			job->tasks[i] = 1; */
 /* 	} */
-	launch.tasks_to_launch = ctx->step_resp->step_layout->tasks;
-	launch.cpus_allocated = ctx->step_resp->step_layout->cpus;
-	launch.global_task_ids = ctx->step_resp->step_layout->tids;
+	launch.tasks_to_launch = ctx->step_resp->tasks;
+	launch.cpus_allocated = ctx->step_resp->tasks;
+	launch.global_task_ids = ctx->step_resp->tids;
 
 	ctx->launch_state->client_io = _setup_step_client_io(
 		ctx, params->local_fds, params->labelio);
 	if (ctx->launch_state->client_io == NULL)
 		return SLURM_ERROR;
-	if (client_io_handler_start(ctx->launch_state->client_io) != SLURM_SUCCESS)
+	if (client_io_handler_start(ctx->launch_state->client_io)
+	    != SLURM_SUCCESS)
 		return SLURM_ERROR;
 
 	launch.num_io_port = ctx->launch_state->client_io->num_listen;
@@ -223,6 +224,7 @@ int slurm_step_launch (slurm_step_ctx ctx,
 	}
 
 	_launch_tasks(ctx, &launch);
+	xfree(launch.cpus_allocated);
 
 	return SLURM_SUCCESS;
 }
@@ -546,13 +548,14 @@ static int _launch_tasks(slurm_step_ctx ctx,
 	msg.ret_list = NULL;
 	msg.orig_addr.sin_addr.s_addr = 0;
 	msg.buffer = buffer;
-	memcpy(&msg.address, &ctx->step_resp->step_layout->node_addr[0],
+	memcpy(&msg.address, &ctx->step_resp->node_addr[0],
 	       sizeof(slurm_addr));
 
 	timeout = slurm_get_msg_timeout();
 	forward_set_launch(&msg.forward,
-			   ctx->step_resp->step_layout->num_hosts,
+			   ctx->step_resp->node_cnt,
 			   &zero,
-			   ctx->step_resp->step_layout,
+			   ctx->step_resp->node_cnt,
+			   ctx->step_resp->node_addr,
 			   itr,
 			   timeout);
 	hostlist_iterator_destroy(itr);
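[Reviewer note] In the first hunk above, launch.cpus_allocated now aliases ctx->step_resp->tasks, yet it is xfree()d right after _launch_tasks(); slurm_free_job_step_create_response_msg() (reworked later in this patch) frees msg->tasks as well, so the same array can end up freed twice. If the xfree() is to stay, a hedged sketch of a defensive copy — a drop-in fragment for the assignment above, not part of the patch:

	/* sketch only: give launch its own copy of the per-node counts
	 * so the xfree() below cannot collide with the later
	 * slurm_free_job_step_create_response_msg() */
	launch.cpus_allocated = xmalloc(sizeof(uint32_t) *
					ctx->step_resp->node_cnt);
	memcpy(launch.cpus_allocated, ctx->step_resp->tasks,
	       sizeof(uint32_t) * ctx->step_resp->node_cnt);
	/* ... _launch_tasks(ctx, &launch); ... */
	xfree(launch.cpus_allocated);	/* frees only the private copy */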
diff --git a/src/common/dist_tasks.c b/src/common/dist_tasks.c
index 52c307a93cb59b0a350c0cd97368510d18cdd9d1..ce682b17ffb4059d186b3f8f407094f9d612db02 100644
--- a/src/common/dist_tasks.c
+++ b/src/common/dist_tasks.c
@@ -52,8 +52,16 @@
 #include "src/common/read_config.h"
 #include "src/common/slurm_protocol_api.h"
 
-static int _task_layout_block(slurm_step_layout_t *step_layout);
-static int _task_layout_cyclic(slurm_step_layout_t *step_layout);
+/* build maps for task layout on nodes */
+static int _task_layout(slurm_step_layout_t *step_layout,
+			const char *arbitrary_nodes,
+			uint32_t *cpus_per_node, uint32_t *cpu_count_reps,
+			uint16_t num_cpu_groups, uint16_t task_dist);
+
+static int _task_layout_block(slurm_step_layout_t *step_layout,
+			      uint32_t *cpus);
+static int _task_layout_cyclic(slurm_step_layout_t *step_layout,
+			       uint32_t *cpus);
 #ifndef HAVE_FRONT_END
 static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
 				 const char *arbitrary_nodes);
@@ -75,10 +83,11 @@ static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
 * NOTE: allocates memory that should be xfreed by caller */
 slurm_step_layout_t *distribute_tasks(
-	const char *mlist, const char *tlist,
+	const char *tlist,
 	uint32_t *cpus_per_node, uint32_t *cpu_count_reps,
 	uint16_t num_cpu_groups,
-	uint16_t num_hosts, uint32_t num_tasks,
+	uint16_t num_hosts,
+	uint32_t num_tasks,
 	uint16_t task_dist)
 {
 	char *arbitrary_nodes = NULL;
@@ -88,49 +97,38 @@ slurm_step_layout_t *distribute_tasks(
 		error("xmalloc error for step_layout");
 		return NULL;
 	}
-
+
 	if(task_dist == SLURM_DIST_ARBITRARY) {
 		hostlist_t hl = NULL;
 		char buf[8192];
 		/* set the node list for the task layout later if user
 		   supplied could be different that the job allocation */
 		arbitrary_nodes = xstrdup(tlist);
-
 		hl = hostlist_create(tlist);
 		hostlist_uniq(hl);
 		hostlist_ranged_string(hl, sizeof(buf), buf);
 		num_hosts = hostlist_count(hl);
 		hostlist_destroy(hl);
-		step_layout->nodes = xstrdup(buf);
+		step_layout->node_list = xstrdup(buf);
 	} else {
-		step_layout->nodes = xstrdup(tlist);
+		step_layout->node_list = xstrdup(tlist);
 	}
-
-	step_layout->num_cpu_groups = num_cpu_groups;
-
-	step_layout->cpu_count_reps =
-		xmalloc(sizeof(uint32_t) * num_cpu_groups);
-	memcpy(step_layout->cpu_count_reps, cpu_count_reps,
-	       (sizeof(uint32_t) * num_cpu_groups));
-
-	step_layout->cpus_per_node =
-		xmalloc(sizeof(uint32_t) * num_cpu_groups);
-	memcpy(step_layout->cpus_per_node, cpus_per_node,
-	       (sizeof(uint32_t) * num_cpu_groups));
-
-	step_layout->num_tasks = num_tasks;
+
+	step_layout->task_cnt = num_tasks;
 
 #ifdef HAVE_FRONT_END	/* Limited job step support */
 	/* All jobs execute through front-end on Blue Gene.
	 * Normally we would not permit execution of job steps,
	 * but can fake it by just allocating all tasks to
	 * one of the allocated nodes. */
-	step_layout->num_hosts = 1;
+	step_layout->node_cnt = 1;
 #else
-	step_layout->num_hosts = num_hosts;
+	step_layout->node_cnt = num_hosts;
 #endif
-	if(task_layout(step_layout, arbitrary_nodes, task_dist)
+	if(_task_layout(step_layout, arbitrary_nodes,
+			cpus_per_node, cpu_count_reps,
+			num_cpu_groups, task_dist)
 	   == SLURM_ERROR) {
 		step_layout_destroy(step_layout);
 		step_layout = NULL;
@@ -138,66 +136,6 @@ slurm_step_layout_t *distribute_tasks(
 	return step_layout;
 }
 
-extern slurm_step_layout_t *step_layout_create(
-	resource_allocation_response_msg_t *alloc_resp,
-	job_step_create_response_msg_t *step_resp,
-	job_step_create_request_msg_t *step_req)
-{
-	slurm_step_layout_t *step_layout = NULL;
-
-	step_layout = xmalloc(sizeof(slurm_step_layout_t));
-	if(!step_layout) {
-		error("xmalloc error for step_layout");
-		return NULL;
-	}
-	//step_layout->hl = NULL;
-
-	if(alloc_resp) {
-		//info("got hostlist of %s", alloc_resp->node_list);
-		step_layout->nodes =
-			(char *)xstrdup(alloc_resp->node_list);
-		//step_layout->hl = hostlist_create(alloc_resp->node_list);
-		step_layout->cpus_per_node = alloc_resp->cpus_per_node;
-		step_layout->cpu_count_reps = alloc_resp->cpu_count_reps;
-#ifdef HAVE_FRONT_END	/* Limited job step support */
-		/* All jobs execute through front-end on Blue Gene.
-		 * Normally we would not permit execution of job steps,
-		 * but can fake it by just allocating all tasks to
-		 * one of the allocated nodes. */
-		step_layout->num_hosts = 1;
-#else
-		step_layout->num_hosts = alloc_resp->node_cnt;
-#endif
-		step_layout->num_tasks = alloc_resp->node_cnt;
-	} else {
-		debug("no alloc_resp given for step_layout_create");
-		step_layout->nodes = NULL;
-		step_layout->cpus_per_node = NULL;
-		step_layout->cpu_count_reps = NULL;
-	}
-
-	if(step_resp) {
-		/* set the node list for the task layout later if user
-		   supplied could be different that the job allocation */
-		xfree(step_layout->nodes);
-		step_layout->nodes = xstrdup(step_resp->node_list);
-/* 		info("host list is %s", step_resp->node_list); */
-	} else {
-		debug("no step_resp given for step_layout_create");
-	}
-
-	if(step_req) {
-		/* this info is only in the step_req which needs to put
-		   in the step_layout most likely the num_tasks set above
-		   from the job allocation is incorrect now for the step */
-		step_layout->num_tasks = step_req->num_tasks;
-	} else {
-		debug("no step_req given for step_layout_create");
-	}
-
-	return step_layout;
-}
-
 /* copys structure for step layout */
 extern slurm_step_layout_t *step_layout_copy(slurm_step_layout_t *step_layout)
 {
@@ -207,43 +145,20 @@ extern slurm_step_layout_t *step_layout_copy(slurm_step_layout_t *step_layout)
 		return NULL;
 
 	layout = xmalloc(sizeof(slurm_step_layout_t));
-	layout->nodes = xstrdup(step_layout->nodes);
-	layout->num_hosts = step_layout->num_hosts;
-	layout->num_tasks = step_layout->num_tasks;
-	layout->num_cpu_groups = step_layout->num_cpu_groups;
-
-	layout->node_addr = xmalloc(sizeof(slurm_addr) * layout->num_hosts);
+	layout->node_list = xstrdup(step_layout->node_list);
+	layout->node_cnt = step_layout->node_cnt;
+	layout->task_cnt = step_layout->task_cnt;
+
+	layout->node_addr = xmalloc(sizeof(slurm_addr) * layout->node_cnt);
 	memcpy(layout->node_addr, step_layout->node_addr,
-	       (sizeof(slurm_addr) * layout->num_hosts));
-
-	layout->cpus = xmalloc(sizeof(uint32_t) * layout->num_hosts);
-	memcpy(layout->cpus, step_layout->cpus,
-	       (sizeof(uint32_t) * layout->num_hosts));
+	       (sizeof(slurm_addr) * layout->node_cnt));
 
-	layout->tasks = xmalloc(sizeof(uint32_t) * layout->num_hosts);
+	layout->tasks = xmalloc(sizeof(uint32_t) * layout->node_cnt);
 	memcpy(layout->tasks, step_layout->tasks,
-	       (sizeof(uint32_t) * layout->num_hosts));
-
-	layout->hostids = xmalloc(sizeof(uint32_t) * layout->num_tasks);
-	memcpy(layout->hostids, step_layout->hostids,
-	       (sizeof(uint32_t) * layout->num_tasks));
-
-	layout->cpus_per_node = xmalloc(sizeof(uint32_t)
-					* layout->num_cpu_groups);
-	memcpy(layout->cpus_per_node, step_layout->cpus_per_node,
-	       (sizeof(uint32_t) * layout->num_cpu_groups));
-
-	layout->cpu_count_reps = xmalloc(sizeof(uint32_t)
-					 * layout->num_cpu_groups);
-	memcpy(layout->cpu_count_reps, step_layout->cpu_count_reps,
-	       (sizeof(uint32_t) * layout->num_cpu_groups));
-
-	layout->host = xmalloc(sizeof(char *) * layout->num_hosts);
-	layout->tids = xmalloc(sizeof(uint32_t *) * layout->num_hosts);
-	for (i=0; i<layout->num_hosts; i++) {
-		layout->host[i] = malloc(strlen(step_layout->host[i]));
-		strcpy(layout->host[i], step_layout->host[i]);
+	       (sizeof(uint32_t) * layout->node_cnt));
+
+	layout->tids = xmalloc(sizeof(uint32_t *) * layout->node_cnt);
+	for (i=0; i<layout->node_cnt; i++) {
 		layout->tids[i] = xmalloc(sizeof(uint32_t)
 					  * layout->tasks[i]);
 		memcpy(layout->tids[i], step_layout->tids[i],
 		       (sizeof(uint32_t) * layout->tasks[i]));
@@ -252,25 +167,83 @@ extern slurm_step_layout_t *step_layout_copy(slurm_step_layout_t *step_layout)
 	return layout;
 }
 
+extern void pack_slurm_step_layout(slurm_step_layout_t *step_layout,
+				   Buf buffer)
+{
+	int i;
+	packstr(step_layout->node_list, buffer);
+	pack16(step_layout->node_cnt, buffer);
+	pack32(step_layout->task_cnt, buffer);
+	slurm_pack_slurm_addr_array(step_layout->node_addr,
+				    step_layout->node_cnt, buffer);
+
+	pack32_array(step_layout->tasks, step_layout->node_cnt, buffer);
+	for(i=0; i<step_layout->node_cnt; i++) {
+		pack32_array(step_layout->tids[i], step_layout->tasks[i],
+			     buffer);
+	}
+}
+
+extern int unpack_slurm_step_layout(slurm_step_layout_t **layout, Buf buffer)
+{
+	uint16_t uint16_tmp;
+	uint32_t uint32_tmp;
+	slurm_step_layout_t *step_layout;
+	int i;
+
+	step_layout = xmalloc(sizeof(slurm_step_layout_t));
+	*layout = step_layout;
+
+	step_layout->node_list = NULL;
+	step_layout->node_cnt = 0;
+	step_layout->tids = NULL;
+	step_layout->tasks = NULL;
+	safe_unpackstr_xmalloc(&step_layout->node_list, &uint16_tmp, buffer);
+	safe_unpack16(&step_layout->node_cnt, buffer);
+	safe_unpack32(&step_layout->task_cnt, buffer);
+
+	if (slurm_unpack_slurm_addr_array(&(step_layout->node_addr),
+					  &uint16_tmp, buffer))
+		goto unpack_error;
+	if (uint16_tmp != step_layout->node_cnt)
+		goto unpack_error;
+
+	safe_unpack32_array(&(step_layout->tasks),
+			    &uint32_tmp, buffer);
+	if (uint32_tmp != step_layout->node_cnt)
+		goto unpack_error;
+
+	step_layout->tids = xmalloc(sizeof(uint32_t *)
+				    * step_layout->node_cnt);
+
+	for(i=0; i<step_layout->node_cnt; i++) {
+		safe_unpack32_array(&(step_layout->tids[i]),
+				    &uint32_tmp,
+				    buffer);
+		if (uint32_tmp != step_layout->tasks[i])
+			goto unpack_error;
+	}
+
+	return SLURM_SUCCESS;
+
+unpack_error:
+	step_layout_destroy(step_layout);
+	*layout = NULL;
+	return SLURM_ERROR;
+}
+
 /* destroys structure for step layout */
 extern int step_layout_destroy(slurm_step_layout_t *step_layout)
 {
 	int i=0;
 	if(step_layout) {
-		xfree(step_layout->nodes);
+		xfree(step_layout->node_list);
 		xfree(step_layout->node_addr);
-		for (i=0; i<step_layout->num_hosts; i++) {
-			if(step_layout->host && step_layout->host[i])
-				free(step_layout->host[i]);
+		for (i=0; i<step_layout->node_cnt; i++) {
 			xfree(step_layout->tids[i]);
 		}
-		xfree(step_layout->host);
 		xfree(step_layout->tids);
-		xfree(step_layout->cpus);
 		xfree(step_layout->tasks);
-		xfree(step_layout->hostids);
-		xfree(step_layout->cpus_per_node);
-		xfree(step_layout->cpu_count_reps);
 
 		xfree(step_layout);
 	}
@@ -279,59 +252,58 @@ extern int step_layout_destroy(slurm_step_layout_t *step_layout)
 }
 
 /* build maps for task layout on nodes */
-extern int task_layout(slurm_step_layout_t *step_layout,
-		       const char *arbitrary_nodes,
-		       uint16_t task_dist)
+static int _task_layout(slurm_step_layout_t *step_layout,
+			const char *arbitrary_nodes,
+			uint32_t *cpus_per_node, uint32_t *cpu_count_reps,
+			uint16_t num_cpu_groups, uint16_t task_dist)
 {
 	int cpu_cnt = 0, cpu_inx = 0, i;
 	hostlist_t hl = NULL;
-
-	if (step_layout->num_hosts == 0)
+	char *name = NULL;
+	uint32_t cpus[step_layout->node_cnt];
+
+	if (step_layout->node_cnt == 0)
 		return SLURM_ERROR;
-	if (step_layout->cpus)	/* layout already completed */
+	if (step_layout->tasks)	/* layout already completed */
 		return SLURM_SUCCESS;
 
 	step_layout->node_addr = xmalloc(sizeof(slurm_addr)
-					 * step_layout->num_hosts);
-	step_layout->cpus = xmalloc(sizeof(uint32_t)
-				    * step_layout->num_hosts);
+					 * step_layout->node_cnt);
 	step_layout->tasks = xmalloc(sizeof(uint32_t)
-				     * step_layout->num_hosts);
-	step_layout->host = xmalloc(sizeof(char *)
-				    * step_layout->num_hosts);
+				     * step_layout->node_cnt);
 	step_layout->tids = xmalloc(sizeof(uint32_t *)
-				    * step_layout->num_hosts);
-	step_layout->hostids = xmalloc(sizeof(uint32_t)
-				       * step_layout->num_tasks);
-
-	hl = hostlist_create(step_layout->nodes);
+				    * step_layout->node_cnt);
+
+	hl = hostlist_create(step_layout->node_list);
 	/* make sure the number of nodes we think we have
 	 * is the correct number */
 	i = hostlist_count(hl);
-	if(step_layout->num_hosts > i)
-		step_layout->num_hosts = i;
+	if(step_layout->node_cnt > i)
+		step_layout->node_cnt = i;
 	debug("laying out the %d tasks on %d hosts %s\n",
-	      step_layout->num_tasks, step_layout->num_hosts,
-	      step_layout->nodes);
-	if(step_layout->num_hosts < 1) {
+	      step_layout->task_cnt, step_layout->node_cnt,
+	      step_layout->node_list);
+	if(step_layout->node_cnt < 1) {
 		error("no hostlist given can't layout tasks");
 		hostlist_destroy(hl);
 		return SLURM_ERROR;
 	}
-	for (i=0; i<step_layout->num_hosts; i++) {
-		step_layout->host[i] = hostlist_shift(hl);
-		if(!step_layout->host[i]) {
+
+	for (i=0; i<step_layout->node_cnt; i++) {
+		name = hostlist_shift(hl);
+		if(!name) {
 			error("hostlist incomplete for this job request");
 			hostlist_destroy(hl);
 			return SLURM_ERROR;
 		}
-		slurm_conf_get_addr(step_layout->host[i],
+		slurm_conf_get_addr(name,
 				    &step_layout->node_addr[i]);
-		debug2("host %d = %s", i, step_layout->host[i]);
-		step_layout->cpus[i] = step_layout->cpus_per_node[cpu_inx];
+		debug2("host %d = %s", i, name);
+		free(name);
+		cpus[i] = cpus_per_node[cpu_inx];
 
-		if ((++cpu_cnt) >= step_layout->cpu_count_reps[cpu_inx]) {
+		if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) {
 			/* move to next record */
 			cpu_inx++;
 			cpu_cnt = 0;
@@ -339,39 +311,50 @@ static int _task_layout(slurm_step_layout_t *step_layout,
 	}
 	hostlist_destroy(hl);
 
-	if(step_layout->num_cpu_groups != cpu_inx) {
+	if(num_cpu_groups != cpu_inx) {
 		info("we got %d cpu groups but was looking for %d",
-		     cpu_inx, step_layout->num_cpu_groups);
+		     cpu_inx, num_cpu_groups);
 	}
 
 	if (task_dist == SLURM_DIST_CYCLIC)
-		return _task_layout_cyclic(step_layout);
+		return _task_layout_cyclic(step_layout, cpus);
 #ifndef HAVE_FRONT_END
 	else if(task_dist == SLURM_DIST_ARBITRARY)
 		return _task_layout_hostfile(step_layout, arbitrary_nodes);
 #endif
 	else
-		return _task_layout_block(step_layout);
+		return _task_layout_block(step_layout, cpus);
 }
 
-int
-step_layout_host_id (slurm_step_layout_t *s, int taskid)
+int step_layout_host_id (slurm_step_layout_t *s, int taskid)
 {
-	if (taskid > s->num_tasks - 1)
+	int i, j;
+	if (taskid > s->task_cnt - 1)
 		return SLURM_ERROR;
+	for (i=0; i < s->node_cnt; i++)
+		for (j=0; j<s->tasks[i]; j++)
+			if(s->tids[i][j] == taskid)
+				return i;
 
-	return (s->hostids[taskid]);
+	return SLURM_ERROR;
 }
 
-char *
-step_layout_host_name (slurm_step_layout_t *s, int taskid)
+char *step_layout_host_name (slurm_step_layout_t *s, int taskid)
 {
 	int hostid = step_layout_host_id (s, taskid);
-
+
 	if (hostid < 0)
 		return NULL;
 
-	return (s->host[hostid]);
+	return nodelist_nth_host(s->node_list, hostid);
+}
+
+char *nodelist_nth_host(const char *nodelist, int inx)
+{
+	hostlist_t hl = hostlist_create(nodelist);
+	char *name = hostlist_nth(hl, inx);
+	hostlist_destroy(hl);
+	return name;
 }
 
 #ifndef HAVE_FRONT_END
@@ -388,15 +371,15 @@ static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
 	hostlist_t job_alloc_hosts = NULL;
 	hostlist_t step_alloc_hosts = NULL;
 
-	debug2("job list is %s", step_layout->nodes);
-	job_alloc_hosts = hostlist_create(step_layout->nodes);
+	debug2("job list is %s", step_layout->node_list);
+	job_alloc_hosts = hostlist_create(step_layout->node_list);
 	itr = hostlist_iterator_create(job_alloc_hosts);
 	debug2("list is %s", arbitrary_nodes);
 	step_alloc_hosts = hostlist_create(arbitrary_nodes);
-	if(hostlist_count(step_alloc_hosts) != step_layout->num_tasks) {
+	if(hostlist_count(step_alloc_hosts) != step_layout->task_cnt) {
 		error("Asked for %d tasks have %d in the nodelist. "
 		      "Check your nodelist",
-		      step_layout->num_tasks,
+		      step_layout->task_cnt,
 		      hostlist_count(step_alloc_hosts));
 		return SLURM_ERROR;
 	}
@@ -409,7 +392,7 @@ static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
 			task_cnt++;
 		}
 		free(host_task);
-		if(task_cnt >= step_layout->num_tasks)
+		if(task_cnt >= step_layout->task_cnt)
 			break;
 	}
 	debug3("%s got %d tasks\n",
@@ -425,7 +408,6 @@ static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
 		while((host_task = hostlist_next(itr_task))) {
 			if(!strcmp(host, host_task)) {
 				step_layout->tids[i][j] = taskid;
-				step_layout->hostids[taskid] = i;
 				j++;
 			}
 			taskid++;
@@ -437,16 +419,16 @@ static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
 	reset_hosts:
 		hostlist_iterator_reset(itr_task);
 		free(host);
-		if(i > step_layout->num_tasks)
+		if(i > step_layout->task_cnt)
 			break;
 	}
 	hostlist_iterator_destroy(itr);
 	hostlist_iterator_destroy(itr_task);
 	hostlist_destroy(job_alloc_hosts);
 	hostlist_destroy(step_alloc_hosts);
-	if(task_cnt != step_layout->num_tasks) {
+	if(task_cnt != step_layout->task_cnt) {
 		error("Asked for %d tasks but placed %d. Check your nodelist",
-		      step_layout->num_tasks, task_cnt);
+		      step_layout->task_cnt, task_cnt);
 		return SLURM_ERROR;
 	}
 
@@ -457,20 +439,20 @@ static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
 /* to effectively deal with heterogeneous nodes, we fake a cyclic
 * distribution to figure out how many tasks go on each node and
 * then make those assignments in a block fashion */
-static int _task_layout_block(slurm_step_layout_t *step_layout)
+static int _task_layout_block(slurm_step_layout_t *step_layout, uint32_t *cpus)
 {
 	int i, j, taskid = 0;
 	bool over_subscribe = false;
 
 	/* figure out how many tasks go to each node */
-	for (j=0; (taskid<step_layout->num_tasks); j++) {   /* cycle counter */
+	for (j=0; (taskid<step_layout->task_cnt); j++) {   /* cycle counter */
 		bool space_remaining = false;
-		for (i=0; ((i<step_layout->num_hosts)
-			   && (taskid<step_layout->num_tasks)); i++) {
-			if ((j<step_layout->cpus[i]) || over_subscribe) {
+		for (i=0; ((i<step_layout->node_cnt)
+			   && (taskid<step_layout->task_cnt)); i++) {
+			if ((j<cpus[i]) || over_subscribe) {
 				taskid++;
 				step_layout->tasks[i]++;
-				if ((j+1) < step_layout->cpus[i])
+				if ((j+1) < cpus[i])
 					space_remaining = true;
 			}
 		}
@@ -480,7 +462,7 @@ static int _task_layout_block(slurm_step_layout_t *step_layout, uint32_t *cpus)
 
 	/* now distribute the tasks */
 	taskid = 0;
-	for (i=0; i < step_layout->num_hosts; i++) {
+	for (i=0; i < step_layout->node_cnt; i++) {
 		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
 					       * step_layout->tasks[i]);
 		if (step_layout->tids[i] == NULL) {
@@ -489,7 +471,6 @@ static int _task_layout_block(slurm_step_layout_t *step_layout, uint32_t *cpus)
 		}
 		for (j=0; j<step_layout->tasks[i]; j++) {
 			step_layout->tids[i][j] = taskid;
-			step_layout->hostids[taskid] = i;
 			taskid++;
 		}
 	}
@@ -509,30 +490,30 @@ static int _task_layout_block(slurm_step_layout_t *step_layout, uint32_t *cpus)
 *   10 11        all processors allocated now
 *   12 13 14 15  etc.
 */
-static int _task_layout_cyclic(slurm_step_layout_t *step_layout)
+static int _task_layout_cyclic(slurm_step_layout_t *step_layout,
+			       uint32_t *cpus)
 {
 	int i, j, taskid = 0;
 	bool over_subscribe = false;
 
-	for (i=0; i<step_layout->num_hosts; i++) {
+	for (i=0; i<step_layout->node_cnt; i++) {
 		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
-					       * step_layout->num_tasks);
+					       * step_layout->task_cnt);
 		if (step_layout->tids[i] == NULL) {
 			slurm_seterrno(ENOMEM);
 			return SLURM_ERROR;
 		}
 	}
-	for (j=0; taskid<step_layout->num_tasks; j++) {   /* cycle counter */
+	for (j=0; taskid<step_layout->task_cnt; j++) {   /* cycle counter */
 		bool space_remaining = false;
-		for (i=0; ((i<step_layout->num_hosts)
-			   && (taskid<step_layout->num_tasks)); i++) {
-			if ((j<step_layout->cpus[i]) || over_subscribe) {
+		for (i=0; ((i<step_layout->node_cnt)
+			   && (taskid<step_layout->task_cnt)); i++) {
+			if ((j<cpus[i]) || over_subscribe) {
 				step_layout->tids[i][step_layout->tasks[i]]
 					= taskid;
-				step_layout->hostids[taskid] = i;
 				taskid++;
 				step_layout->tasks[i]++;
-				if ((j+1) < step_layout->cpus[i])
+				if ((j+1) < cpus[i])
 					space_remaining = true;
 			}
 		}
@@ -542,3 +523,4 @@ static int _task_layout_cyclic(slurm_step_layout_t *step_layout,
 
 	return SLURM_SUCCESS;
 }
+
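[Reviewer note] A standalone toy (not SLURM code) mirroring the block fill loop above for cpus = {4, 2} and six tasks. It prints 4 and 2 tasks respectively: block placement gives node 0 tids 0-3 and node 1 tids 4-5, whereas the cyclic variant would interleave tids 0,2,4,5 on node 0 and 1,3 on node 1. over_subscribe is omitted because the task count fits the cpu total here:

	#include <stdio.h>

	int main(void)
	{
		unsigned cpus[2] = { 4, 2 }, tasks[2] = { 0, 0 };
		unsigned i, j, taskid = 0, task_cnt = 6;

		/* same cycle counter as _task_layout_block() above */
		for (j = 0; taskid < task_cnt; j++)
			for (i = 0; i < 2 && taskid < task_cnt; i++)
				if (j < cpus[i]) {
					taskid++;
					tasks[i]++;
				}
		for (i = 0; i < 2; i++)
			printf("node %u: %u tasks\n", i, tasks[i]);
		return 0;
	}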
diff --git a/src/common/dist_tasks.h b/src/common/dist_tasks.h
index b54769d97ad2bcc361f31f93cc7377661923f0f6..b073baab760855884a5e7b04a0a26085c4b4db91 100644
--- a/src/common/dist_tasks.h
+++ b/src/common/dist_tasks.h
@@ -44,6 +44,19 @@
 #endif
 
 #include "src/common/hostlist.h"
+#include "src/common/pack.h"
+
+typedef struct slurm_step_layout {
+	char *node_list;	/* list of nodes in step */
+	slurm_addr *node_addr;	/* corresponding addresses */
+	uint16_t node_cnt;	/* node count */
+	uint32_t task_cnt;	/* number of tasks to execute */
+
+	uint32_t *tasks;	/* number of tasks on each host
+				 * & num of cpus on each host allocated */
+	uint32_t **tids;	/* host id => task id mapping */
+} slurm_step_layout_t;
+
 /*
 * distribute_tasks - determine how many tasks of a job will be run on each.
@@ -59,29 +72,28 @@
 * RET a pointer to an integer array listing task counts per node
 * NOTE: allocates memory that should be xfreed by caller */
-slurm_step_layout_t *distribute_tasks(const char *mlist,
-				      const char *tlist,
-				      uint32_t *cpus_per_node,
-				      uint32_t *cpu_count_reps,
-				      uint16_t num_cpu_groups,
-				      uint16_t num_hosts,
-				      uint32_t num_tasks,
-				      uint16_t task_dist);
+extern slurm_step_layout_t *distribute_tasks(const char *tlist,
+					     uint32_t *cpus_per_node,
+					     uint32_t *cpu_count_reps,
+					     uint16_t num_cpu_groups,
+					     uint16_t num_hosts,
+					     uint32_t num_tasks,
+					     uint16_t task_dist);
 
-/* creates structure for step layout */
-extern slurm_step_layout_t *step_layout_create(
-	resource_allocation_response_msg_t *alloc_resp,
-	job_step_create_response_msg_t *step_resp,
-	job_step_create_request_msg_t *step_req);
+/* copies structure for step layout */
 extern slurm_step_layout_t *step_layout_copy(slurm_step_layout_t *step_layout);
+
+/* pack and unpack structure */
+extern void pack_slurm_step_layout(slurm_step_layout_t *step_layout,
+				   Buf buffer);
+extern int unpack_slurm_step_layout(slurm_step_layout_t **layout, Buf buffer);
+
 /* destroys structure for step layout */
 extern int step_layout_destroy(slurm_step_layout_t *step_layout);
 
-/* build maps for task layout on nodes */
-extern int task_layout(slurm_step_layout_t *step_layout,
-		       const char *arbitrary_nodes, uint16_t task_dist);
 extern int step_layout_host_id (slurm_step_layout_t *s, int taskid);
 
-extern char * step_layout_host_name (slurm_step_layout_t *s, int hostid);
-
+extern char *step_layout_host_name (slurm_step_layout_t *s, int hostid);
+extern char *nodelist_nth_host(const char *nodelist, int inx);
+
 #endif /* !_DIST_TASKS_H */
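[Reviewer note] The pack/unpack pair declared above serializes exactly the fields left in the slimmed struct. A hedged round-trip sketch, assuming the Buf helpers from src/common/pack.h (init_buf/set_buf_offset/free_buf) and a layout already built by distribute_tasks(); the 4096-byte buffer size is an assumption:

	/* sketch only: serialize a layout and read it back */
	static int layout_roundtrip(slurm_step_layout_t *layout)
	{
		Buf buf = init_buf(4096);
		slurm_step_layout_t *copy = NULL;
		int rc;

		pack_slurm_step_layout(layout, buf);
		set_buf_offset(buf, 0);		/* rewind before unpacking */
		rc = unpack_slurm_step_layout(&copy, buf);
		free_buf(buf);
		if (rc == SLURM_SUCCESS)
			step_layout_destroy(copy);
		return rc;
	}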
diff --git a/src/common/forward.c b/src/common/forward.c
index 79f5fd75d7bffd51188c41c620fa41e791deee85..6bac87c81e152d9f01da244be91592479fb7404c 100644
--- a/src/common/forward.c
+++ b/src/common/forward.c
@@ -48,8 +48,6 @@
 #endif /* WITH_PTHREADS */
 
 #define MAX_RETRIES 3
-int _destroy_data_info_data(uint32_t type, ret_data_info_t *ret_data_info);
-
 
 void *_forward_thread(void *arg)
 {
@@ -224,139 +222,6 @@ cleanup:
 	return (NULL);
 }
 
-int _destroy_data_info_data(uint32_t type, ret_data_info_t *ret_data_info)
-{
-	switch(type) {
-	case REQUEST_BUILD_INFO:
-		slurm_free_last_update_msg(ret_data_info->data);
-		break;
-	case REQUEST_JOB_INFO:
-		slurm_free_job_info_request_msg(ret_data_info->data);
-		break;
-	case REQUEST_NODE_INFO:
-		slurm_free_node_info_request_msg(ret_data_info->data);
-		break;
-	case REQUEST_PARTITION_INFO:
-		slurm_free_part_info_request_msg(ret_data_info->data);
-		break;
-	case MESSAGE_EPILOG_COMPLETE:
-		slurm_free_epilog_complete_msg(ret_data_info->data);
-		break;
-	case REQUEST_CANCEL_JOB_STEP:
-		slurm_free_job_step_kill_msg(ret_data_info->data);
-		break;
-	case REQUEST_COMPLETE_JOB_ALLOCATION:
-		slurm_free_complete_job_allocation_msg(ret_data_info->data);
-		break;
-	case REQUEST_COMPLETE_BATCH_SCRIPT:
-		slurm_free_complete_batch_script_msg(ret_data_info->data);
-		break;
-	case REQUEST_JOB_STEP_CREATE:
-		slurm_free_job_step_create_request_msg(ret_data_info->data);
-		break;
-	case REQUEST_JOB_STEP_INFO:
-		slurm_free_job_step_info_request_msg(ret_data_info->data);
-		break;
-	case REQUEST_RESOURCE_ALLOCATION:
-	case REQUEST_JOB_WILL_RUN:
-	case REQUEST_SUBMIT_BATCH_JOB:
-	case REQUEST_UPDATE_JOB:
-		slurm_free_job_desc_msg(ret_data_info->data);
-		break;
-	case MESSAGE_NODE_REGISTRATION_STATUS:
-		slurm_free_node_registration_status_msg(ret_data_info->data);
-		break;
-	case REQUEST_JOB_END_TIME:
-	case REQUEST_JOB_ALLOCATION_INFO:
-		slurm_free_job_alloc_info_msg(ret_data_info->data);
-		break;
-	case SLURM_SUCCESS:
-	case REQUEST_PING:
-	case REQUEST_RECONFIGURE:
-	case REQUEST_CONTROL:
-	case REQUEST_SHUTDOWN_IMMEDIATE:
-		/* No body to free */
-		break;
-	case REQUEST_SHUTDOWN:
-		slurm_free_shutdown_msg(ret_data_info->data);
-		break;
-	case REQUEST_UPDATE_NODE:
-		slurm_free_update_node_msg(ret_data_info->data);
-		break;
-	case REQUEST_UPDATE_PARTITION:
-		slurm_free_update_part_msg(ret_data_info->data);
-		break;
-	case REQUEST_DELETE_PARTITION:
-		slurm_free_delete_part_msg(ret_data_info->data);
-		break;
-	case REQUEST_NODE_REGISTRATION_STATUS:
-		slurm_free_node_registration_status_msg(ret_data_info->data);
-		break;
-	case REQUEST_CHECKPOINT:
-		slurm_free_checkpoint_msg(ret_data_info->data);
-		break;
-	case REQUEST_CHECKPOINT_COMP:
-		slurm_free_checkpoint_comp_msg(ret_data_info->data);
-		break;
-	case REQUEST_SUSPEND:
-		slurm_free_suspend_msg(ret_data_info->data);
-		break;
-	case REQUEST_JOB_READY:
-		slurm_free_job_id_msg(ret_data_info->data);
-		break;
-	case REQUEST_NODE_SELECT_INFO:
-		slurm_free_node_select_msg(ret_data_info->data);
-		break;
-	case REQUEST_STEP_COMPLETE:
-		slurm_free_step_complete_msg(ret_data_info->data);
-		break;
-	case MESSAGE_STAT_JOBACCT:
-		slurm_free_stat_jobacct_msg(ret_data_info->data);
-		break;
-	case REQUEST_BATCH_JOB_LAUNCH:
-		slurm_free_job_launch_msg(ret_data_info->data);
-		break;
-	case REQUEST_LAUNCH_TASKS:
-		slurm_free_launch_tasks_request_msg(ret_data_info->data);
-		break;
-	case REQUEST_SPAWN_TASK:
-		slurm_free_spawn_task_request_msg(ret_data_info->data);
-		break;
-	case REQUEST_SIGNAL_TASKS:
-	case REQUEST_TERMINATE_TASKS:
-		slurm_free_kill_tasks_msg(ret_data_info->data);
-		break;
-	case REQUEST_KILL_TIMELIMIT:
-		slurm_free_timelimit_msg(ret_data_info->data);
-		break;
-	case REQUEST_REATTACH_TASKS:
-		slurm_free_reattach_tasks_request_msg(ret_data_info->data);
-		break;
-	case REQUEST_SIGNAL_JOB:
-		slurm_free_signal_job_msg(ret_data_info->data);
-		break;
-	case REQUEST_TERMINATE_JOB:
-		slurm_free_kill_job_msg(ret_data_info->data);
-		break;
-	case REQUEST_UPDATE_JOB_TIME:
-		slurm_free_update_job_time_msg(ret_data_info->data);
-		break;
-	case REQUEST_JOB_ID:
-		slurm_free_job_id_request_msg(ret_data_info->data);
-		break;
-	case REQUEST_FILE_BCAST:
-		slurm_free_file_bcast_msg(ret_data_info->data);
-		break;
-	case RESPONSE_SLURM_RC:
-		slurm_free_return_code_msg(ret_data_info->data);
-		break;
-	default:
-		error("invalid FORWARD ret_type=%u", type);
-		break;
-	}
-	return SLURM_SUCCESS;
-}
-
 /*
 * forward_init - initilize forward structure
 * IN: forward - forward_t * - struct to store forward info
@@ -608,8 +473,9 @@ extern int forward_set(forward_t *forward,
 *             a job launch
 * IN: forward - forward_t * - struct to store forward info
 * IN: span - int - count of forwards to do
- * IN: step_layout - slurm_step_layout_t * - contains information about hosts
- *                   from original message
+ * IN: pos - int * - position in the node list
+ * IN: total - int - total count of nodes in message
+ * IN: node_addr - slurm_addr * - contains addresses to forward to
 * IN: itr - hostlist_iterator_t - count into host list of hosts to
 *           send messages to
 * IN: timeout - int32_t - timeout if any to wait for
@@ -619,20 +485,14 @@ extern int forward_set(forward_t *forward,
 extern int forward_set_launch(forward_t *forward,
 			      int span,
 			      int *pos,
-			      slurm_step_layout_t *step_layout,
+			      int total,
+			      slurm_addr *node_addr,
 			      hostlist_iterator_t itr,
 			      int32_t timeout)
 {
 	int j=1;
 	char *host = NULL;
-	int total = step_layout->num_hosts;
-
-	/* char name[MAX_SLURM_NAME]; */
-/* 	strncpy(name, */
-/* 		step_layout->host[*pos], */
-/* 		MAX_SLURM_NAME); */
-/* 	info("forwarding to %s",name); */
 
 	if(span > 0) {
 		forward->addr = xmalloc(sizeof(slurm_addr) * span);
@@ -646,7 +506,7 @@ extern int forward_set_launch(forward_t *forward,
 		host = hostlist_next(itr);
 		memcpy(&forward->addr[j-1],
-		       &step_layout->node_addr[(*pos+j)],
+		       &node_addr[(*pos+j)],
 		       sizeof(slurm_addr));
 		strcpy(&forward->name[(j-1) * MAX_SLURM_NAME], host);
 		forward->node_id[j-1] = (*pos+j);
@@ -780,8 +640,8 @@ void destroy_ret_types(void *object)
 
 	if(ret_type->ret_data_list) {
 		while((ret_data_info = list_pop(ret_type->ret_data_list))) {
-			_destroy_data_info_data(ret_type->type,
-						ret_data_info);
+			slurm_free_msg_data(ret_type->type,
+					    ret_data_info->data);
 			destroy_data_info(ret_data_info);
 		}
 		list_destroy(ret_type->ret_data_list);
diff --git a/src/common/forward.h b/src/common/forward.h
index 180e88fb6c2d75669501866710bea5255f1f06ed..c567c84e50fb25e7ab79a8b649fe71c49a600509 100644
--- a/src/common/forward.h
+++ b/src/common/forward.h
@@ -157,8 +157,9 @@ extern int forward_set (forward_t *forward,
 *             a job launch
 * IN: forward - forward_t * - struct to store forward info
 * IN: span - int - count of forwards to do
- * IN: step_layout - slurm_step_layout_t * - contains information about hosts
- *                   from original message
+ * IN: pos - int * - position in the node list
+ * IN: total - int - total count of nodes in message
+ * IN: node_addr - slurm_addr * - contains addresses to forward to
 * IN: itr - hostlist_iterator_t - count into host list of hosts to
 *           send messages to
 * IN: timeout - int32_t - timeout if any to wait for
@@ -200,7 +201,8 @@
 while((host = hostlist_next(itr)) != NULL) {
 	forward_set_launch(&m->forward,
 			   span[job->thr_count],
 			   &i,
-			   job->step_layout,
+			   job->step_layout->node_cnt,
+			   job->step_layout->node_addr,
 			   itr,
 			   opt.msg_timeout);
 	//increment the count of threads created
@@ -214,7 +216,8 @@ hostlist_destroy(hostlist);
 extern int forward_set_launch (forward_t *forward,
 			       int span,
 			       int *pos,
-			       slurm_step_layout_t *step_layout,
+			       int total,
+			       slurm_addr * node_addr,
 			       hostlist_iterator_t itr,
 			       int32_t timeout);
diff --git a/src/common/global_srun.c b/src/common/global_srun.c
index 991730c2b59f72f5b37a3632975445a6e0bac3fc..d723d92657277e4e00d1e88fe801a47b69da87f1 100644
--- a/src/common/global_srun.c
+++ b/src/common/global_srun.c
@@ -37,6 +37,7 @@
 #include <string.h>
 
 #include <slurm/slurm_errno.h>
+#include <stdlib.h>
 
 #include "src/common/log.h"
 #include "src/common/macros.h"
@@ -109,8 +110,10 @@ fwd_signal(srun_job_t *job, int signo, int max_threads)
 
 	for (i = 0; i < job->nhosts; i++) {
 		if (job->host_state[i] != SRUN_HOST_REPLIED) {
-			debug2("%s has not yet replied\n",
-			       job->step_layout->host[i]);
+			char *name = nodelist_nth_host(
+				job->step_layout->node_list, i);
+			debug2("%s has not yet replied\n", name);
+			free(name);
 			continue;
 		}
 		if (job_active_tasks_on_host(job, i) == 0)
@@ -198,7 +201,8 @@ static void * _p_signal_task(void *args)
 	task_info_t *info = (task_info_t *)args;
 	slurm_msg_t *req = info->req_ptr;
 	srun_job_t *job = info->job_ptr;
-	char *host = job->step_layout->host[info->host_inx];
+	char *host = nodelist_nth_host(job->step_layout->node_list,
+				       info->host_inx);
 	char *tmpchar = NULL;
 	List ret_list = NULL;
 	ListIterator itr;
@@ -275,6 +279,7 @@ done:
 	active--;
 	pthread_cond_signal(&active_cond);
 	slurm_mutex_unlock(&active_mutex);
+	free(host);
 	xfree(args);
 	return NULL;
 }
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 7baabf5edaeb18e19fea9691da43ec819371ca6a..4b10c46a198c7bc678ec8aa9f77fff10eb3f6e20 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -1290,10 +1290,10 @@ void slurm_pack_slurm_addr(slurm_addr * slurm_address, Buf buffer)
 	_slurm_pack_slurm_addr(slurm_address, buffer);
 }
 
-/* slurm_pack_slurm_addr
+/* slurm_unpack_slurm_addr
 * unpacks a buffer into a slurm_addr after serialization transport
 * OUT slurm_address - slurm_addr to unpack to
- * IN/OUT buffer - buffer to upack the slurm_addr from
+ * IN/OUT buffer - buffer to unpack the slurm_addr from
 * returns - SLURM error code
 */
 int slurm_unpack_slurm_addr_no_alloc(slurm_addr * slurm_address,
@@ -1302,6 +1302,59 @@ int slurm_unpack_slurm_addr_no_alloc(slurm_addr * slurm_address,
 	return _slurm_unpack_slurm_addr_no_alloc(slurm_address, buffer);
 }
 
+/* slurm_pack_slurm_addr_array
+ * packs an array of slurm_addrs into a buffer
+ * IN slurm_address - slurm_addr array to pack
+ * IN size_val - how many to pack
+ * IN/OUT buffer - buffer to pack the slurm_addrs into
+ * returns - nothing
+ */
+void slurm_pack_slurm_addr_array(slurm_addr * slurm_address,
+				 uint16_t size_val, Buf buffer)
+{
+	int i = 0;
+	uint16_t nl = htons(size_val);
+	pack16((uint16_t)nl, buffer);
+
+	for (i = 0; i < size_val; i++) {
+		slurm_pack_slurm_addr(slurm_address + i, buffer);
+	}
+
+}
+
+/* slurm_unpack_slurm_addr_array
+ * unpacks an array of slurm_addrs from a buffer
+ * OUT slurm_address - slurm_addr array to unpack to
+ * OUT size_val - how many were unpacked
+ * IN/OUT buffer - buffer to unpack the slurm_addrs from
+ * returns - SLURM error code
+ */
+int slurm_unpack_slurm_addr_array(slurm_addr ** slurm_address,
+				  uint16_t * size_val, Buf buffer)
+{
+	int i = 0;
+	uint16_t nl;
+
+	*slurm_address = NULL;
+	safe_unpack16(&nl, buffer);
+	*size_val = ntohs(nl);
+	*slurm_address = xmalloc((*size_val) * sizeof(slurm_addr));
+
+	for (i = 0; i < *size_val; i++) {
+		if (slurm_unpack_slurm_addr_no_alloc((*slurm_address) + i,
+						     buffer))
+			goto unpack_error;
+
+	}
+	return SLURM_SUCCESS;
+
+unpack_error:
+	xfree(*slurm_address);
+	*slurm_address = NULL;
+	return SLURM_ERROR;
+}
+
+
 /**********************************************************************\
 * simplified communication routines
 * They open a connection do work then close the connection all within
diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h
index 8b266548134a1e7c6b6b8864460b687004434efc..e37ad02ecd78fc89930ddebfd5d6e59e78473579 100644
--- a/src/common/slurm_protocol_api.h
+++ b/src/common/slurm_protocol_api.h
@@ -512,6 +512,25 @@ void inline slurm_pack_slurm_addr(slurm_addr * slurm_address, Buf buffer);
 int inline slurm_unpack_slurm_addr_no_alloc(slurm_addr * slurm_address,
 					    Buf buffer);
 
+/* slurm_pack_slurm_addr_array
+ * packs an array of slurm_addrs into a buffer
+ * IN slurm_address - slurm_addr array to pack
+ * IN size_val - how many to pack
+ * IN/OUT buffer - buffer to pack the slurm_addrs into
+ * returns - nothing
+ */
+void inline slurm_pack_slurm_addr_array(slurm_addr * slurm_address,
+					uint16_t size_val, Buf buffer);
+/* slurm_unpack_slurm_addr_array
+ * unpacks an array of slurm_addrs from a buffer
+ * OUT slurm_address - slurm_addr array to unpack to
+ * OUT size_val - how many were unpacked
+ * IN/OUT buffer - buffer to unpack the slurm_addrs from
+ * returns - SLURM error code
+ */
+int inline slurm_unpack_slurm_addr_array(slurm_addr ** slurm_address,
+					 uint16_t * size_val, Buf buffer);
+
 /**********************************************************************\
 * simplified communication routines
 * They open a connection do work then close the connection all within
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 15f36d8215b4041d4cc566cbfcc9a5a1bb48e5a4..65aac57f8c9e09472dc8884a16df3757adabfb64 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -678,7 +678,6 @@ void slurm_free_resource_allocation_response_msg (
 		xfree(msg->node_list);
 		xfree(msg->cpus_per_node);
 		xfree(msg->cpu_count_reps);
-		xfree(msg->node_addr);
 		xfree(msg);
 	}
 }
@@ -711,10 +710,18 @@ void slurm_free_job_alloc_info_response_msg(job_alloc_info_response_msg_t *msg)
 void slurm_free_job_step_create_response_msg(
 	job_step_create_response_msg_t * msg)
 {
+	int i;
 	if (msg) {
 		slurm_cred_destroy(msg->cred);
-		step_layout_destroy(msg->step_layout);
-
+		xfree(msg->node_list);
+		xfree(msg->node_addr);
+		for (i=0; i<msg->node_cnt; i++) {
+			xfree(msg->tids[i]);
+		}
+
+		xfree(msg->tasks);
+		xfree(msg->tids);
+
 		if (msg->switch_job)
 			switch_free_jobinfo(msg->switch_job);
@@ -979,3 +986,138 @@ void inline slurm_free_node_select_msg(
 {
 	xfree(msg);
 }
+
+
+extern int slurm_free_msg_data(uint32_t type, void *data)
+{
+	switch(type) {
+	case REQUEST_BUILD_INFO:
+		slurm_free_last_update_msg(data);
+		break;
+	case REQUEST_JOB_INFO:
+		slurm_free_job_info_request_msg(data);
+		break;
+	case REQUEST_NODE_INFO:
+		slurm_free_node_info_request_msg(data);
+		break;
+	case REQUEST_PARTITION_INFO:
+		slurm_free_part_info_request_msg(data);
+		break;
+	case MESSAGE_EPILOG_COMPLETE:
+		slurm_free_epilog_complete_msg(data);
+		break;
+	case REQUEST_CANCEL_JOB_STEP:
+		slurm_free_job_step_kill_msg(data);
+		break;
+	case REQUEST_COMPLETE_JOB_ALLOCATION:
+		slurm_free_complete_job_allocation_msg(data);
+		break;
+	case REQUEST_COMPLETE_BATCH_SCRIPT:
+		slurm_free_complete_batch_script_msg(data);
+		break;
+	case REQUEST_JOB_STEP_CREATE:
+		slurm_free_job_step_create_request_msg(data);
+		break;
+	case REQUEST_JOB_STEP_INFO:
+		slurm_free_job_step_info_request_msg(data);
+		break;
+	case REQUEST_RESOURCE_ALLOCATION:
+	case REQUEST_JOB_WILL_RUN:
+	case REQUEST_SUBMIT_BATCH_JOB:
+	case REQUEST_UPDATE_JOB:
+		slurm_free_job_desc_msg(data);
+		break;
+	case MESSAGE_NODE_REGISTRATION_STATUS:
+		slurm_free_node_registration_status_msg(data);
+		break;
+	case REQUEST_JOB_END_TIME:
+	case REQUEST_JOB_ALLOCATION_INFO:
+		slurm_free_job_alloc_info_msg(data);
+		break;
+	case SLURM_SUCCESS:
+	case REQUEST_PING:
+	case REQUEST_RECONFIGURE:
+	case REQUEST_CONTROL:
+	case REQUEST_SHUTDOWN_IMMEDIATE:
+		/* No body to free */
+		break;
+	case REQUEST_SHUTDOWN:
+		slurm_free_shutdown_msg(data);
+		break;
+	case REQUEST_UPDATE_NODE:
+		slurm_free_update_node_msg(data);
+		break;
+	case REQUEST_UPDATE_PARTITION:
+		slurm_free_update_part_msg(data);
+		break;
+	case REQUEST_DELETE_PARTITION:
+		slurm_free_delete_part_msg(data);
+		break;
+	case REQUEST_NODE_REGISTRATION_STATUS:
+		slurm_free_node_registration_status_msg(data);
+		break;
+	case REQUEST_CHECKPOINT:
+		slurm_free_checkpoint_msg(data);
+		break;
+	case REQUEST_CHECKPOINT_COMP:
+		slurm_free_checkpoint_comp_msg(data);
+		break;
+	case REQUEST_SUSPEND:
+		slurm_free_suspend_msg(data);
+		break;
+	case REQUEST_JOB_READY:
+		slurm_free_job_id_msg(data);
+		break;
+	case REQUEST_NODE_SELECT_INFO:
+		slurm_free_node_select_msg(data);
+		break;
+	case REQUEST_STEP_COMPLETE:
+		slurm_free_step_complete_msg(data);
+		break;
+	case MESSAGE_STAT_JOBACCT:
+		slurm_free_stat_jobacct_msg(data);
+		break;
+	case REQUEST_BATCH_JOB_LAUNCH:
+		slurm_free_job_launch_msg(data);
+		break;
+	case REQUEST_LAUNCH_TASKS:
+		slurm_free_launch_tasks_request_msg(data);
+		break;
+	case REQUEST_SPAWN_TASK:
+		slurm_free_spawn_task_request_msg(data);
+		break;
+	case REQUEST_SIGNAL_TASKS:
+	case REQUEST_TERMINATE_TASKS:
+		slurm_free_kill_tasks_msg(data);
+		break;
+	case REQUEST_KILL_TIMELIMIT:
+		slurm_free_timelimit_msg(data);
+		break;
+	case REQUEST_REATTACH_TASKS:
+		slurm_free_reattach_tasks_request_msg(data);
+		break;
+	case REQUEST_SIGNAL_JOB:
+		slurm_free_signal_job_msg(data);
+		break;
+	case REQUEST_TERMINATE_JOB:
+		slurm_free_kill_job_msg(data);
+		break;
+	case REQUEST_UPDATE_JOB_TIME:
+		slurm_free_update_job_time_msg(data);
+		break;
+	case REQUEST_JOB_ID:
+		slurm_free_job_id_request_msg(data);
+		break;
+	case REQUEST_FILE_BCAST:
+		slurm_free_file_bcast_msg(data);
+		break;
+	case RESPONSE_SLURM_RC:
+		slurm_free_return_code_msg(data);
+		break;
+	default:
+		error("invalid type trying to be freed %u", type);
+		break;
+	}
+	return SLURM_SUCCESS;
+}
+
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index a9dbc8564acc8e23e41c9ca1339a8da9cc085214..ad1d6f1c899c78db312cc0024d2953c8f640cacf 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -150,6 +150,7 @@ typedef enum {
 	REQUEST_COMPLETE_JOB_ALLOCATION,
 	REQUEST_COMPLETE_BATCH_SCRIPT,
 	MESSAGE_STAT_JOBACCT,
+	RESPONSE_STEP_LAYOUT,
 	REQUEST_JOB_REQUEUE,
 
 	REQUEST_LAUNCH_TASKS = 6001,
@@ -684,6 +685,7 @@ void inline slurm_free_step_complete_msg(step_complete_msg_t *msg);
 void inline slurm_free_stat_jobacct_msg(stat_jobacct_msg_t *msg);
 void inline slurm_free_node_select_msg(
 		node_info_select_request_msg_t *msg);
+extern int slurm_free_msg_data(uint32_t type, void *data);
 
 extern char *job_reason_string(enum job_wait_reason inx);
 extern char *job_state_string(enum job_states inx);
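[Reviewer note] The per-message-type free switch moved out of forward.c into slurm_protocol_defs.c as slurm_free_msg_data(), so any holder of a (msg_type, data) pair can release the body; destroy_ret_types() above is the first caller. A minimal sketch (wrapper name is ours) of the pattern other call sites can now use:

	/* sketch only: free an unpacked message body by protocol type */
	static void free_msg_body(slurm_msg_t *msg)
	{
		slurm_free_msg_data(msg->msg_type, msg->data);
		msg->data = NULL;	/* guard against a second free */
	}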
REQUEST_SIGNAL_JOB: rc = _unpack_signal_job_msg((signal_job_msg_t **)&(msg->data), buffer); @@ -1101,9 +1106,7 @@ _pack_resource_allocation_response_msg(resource_allocation_response_msg_t * } pack16((uint16_t)msg->node_cnt, buffer); - if (msg->node_cnt > 0) - _pack_slurm_addr_array(msg->node_addr, msg->node_cnt, buffer); - + select_g_pack_jobinfo(msg->select_jobinfo, buffer); } @@ -1143,15 +1146,7 @@ _unpack_resource_allocation_response_msg(resource_allocation_response_msg_t } safe_unpack16(&tmp_ptr->node_cnt, buffer); - if (tmp_ptr->node_cnt > 0) { - if (_unpack_slurm_addr_array(&(tmp_ptr->node_addr), - &uint16_tmp, buffer)) - goto unpack_error; - if (uint16_tmp != tmp_ptr->node_cnt) - goto unpack_error; - } else - tmp_ptr->node_addr = NULL; - + if (select_g_alloc_jobinfo (&tmp_ptr->select_jobinfo) || select_g_unpack_jobinfo(tmp_ptr->select_jobinfo, buffer)) goto unpack_error; @@ -1634,11 +1629,18 @@ static void _pack_job_step_create_response_msg(job_step_create_response_msg_t * msg, Buf buffer) { + int i; xassert(msg != NULL); pack32((uint32_t)msg->job_step_id, buffer); + pack16((uint16_t)msg->node_cnt, buffer); packstr(msg->node_list, buffer); - _pack_slurm_step_layout(msg->step_layout, buffer); + slurm_pack_slurm_addr_array(msg->node_addr, msg->node_cnt, buffer); + + pack32_array(msg->tasks, msg->node_cnt, buffer); + for(i=0; i<msg->node_cnt; i++) + pack32_array(msg->tids[i], msg->tasks[i], buffer); + slurm_cred_pack(msg->cred, buffer); switch_pack_jobinfo(msg->switch_job, buffer); @@ -1648,8 +1650,10 @@ static int _unpack_job_step_create_response_msg(job_step_create_response_msg_t ** msg, Buf buffer) { + job_step_create_response_msg_t *tmp_ptr = NULL; uint16_t uint16_tmp; - job_step_create_response_msg_t *tmp_ptr; + uint32_t uint32_tmp; + int i; /* alloc memory for structure */ xassert(msg != NULL); @@ -1657,12 +1661,25 @@ _unpack_job_step_create_response_msg(job_step_create_response_msg_t ** msg, *msg = tmp_ptr; safe_unpack32(&tmp_ptr->job_step_id, buffer); + + safe_unpack16(&tmp_ptr->node_cnt, buffer); safe_unpackstr_xmalloc(&tmp_ptr->node_list, &uint16_tmp, buffer); - - if(_unpack_slurm_step_layout(&tmp_ptr->step_layout, buffer) - != SLURM_SUCCESS) + if (slurm_unpack_slurm_addr_array( + &(tmp_ptr->node_addr), &uint16_tmp, buffer)) + goto unpack_error; + if (uint16_tmp != tmp_ptr->node_cnt) goto unpack_error; + safe_unpack32_array(&(tmp_ptr->tasks), &uint32_tmp, buffer); + if (uint32_tmp != tmp_ptr->node_cnt) + goto unpack_error; + tmp_ptr->tids = xmalloc(sizeof(uint32_t *) * tmp_ptr->node_cnt); + for(i=0; i<tmp_ptr->node_cnt; i++) { + safe_unpack32_array(&(tmp_ptr->tids[i]), &uint32_tmp, buffer); + if (uint32_tmp != tmp_ptr->tasks[i]) + goto unpack_error; + } + if (!(tmp_ptr->cred = slurm_cred_unpack(buffer))) goto unpack_error; @@ -1675,7 +1692,6 @@ _unpack_job_step_create_response_msg(job_step_create_response_msg_t ** msg, return SLURM_SUCCESS; unpack_error: - xfree(tmp_ptr->node_list); xfree(tmp_ptr); *msg = NULL; return SLURM_ERROR; @@ -3203,159 +3219,16 @@ static void _pack_slurm_addr_array(slurm_addr * slurm_address, uint16_t size_val, Buf buffer) { - int i = 0; - uint16_t nl = htons(size_val); - pack16((uint16_t)nl, buffer); - - for (i = 0; i < size_val; i++) { - slurm_pack_slurm_addr(slurm_address + i, buffer); - } - + slurm_pack_slurm_addr_array(slurm_address, size_val, buffer); } static int _unpack_slurm_addr_array(slurm_addr ** slurm_address, uint16_t * size_val, Buf buffer) { - int i = 0; - uint16_t nl; - - *slurm_address = NULL; - safe_unpack16(&nl, buffer); - 
*size_val = ntohs(nl); - *slurm_address = xmalloc((*size_val) * sizeof(slurm_addr)); - - for (i = 0; i < *size_val; i++) { - if (slurm_unpack_slurm_addr_no_alloc((*slurm_address) + i, - buffer)) - goto unpack_error; - - } - return SLURM_SUCCESS; - -unpack_error: - xfree(*slurm_address); - *slurm_address = NULL; - return SLURM_ERROR; -} - -static void -_pack_slurm_step_layout(slurm_step_layout_t *step_layout, Buf buffer) -{ - int i; - packstr(step_layout->nodes, buffer); - pack16(step_layout->num_hosts, buffer); - pack32(step_layout->num_tasks, buffer); - pack16(step_layout->num_cpu_groups, buffer); - _pack_slurm_addr_array(step_layout->node_addr, - step_layout->num_hosts, buffer); - - pack32_array(step_layout->cpus_per_node, - step_layout->num_cpu_groups, buffer); - pack32_array(step_layout->cpu_count_reps, - step_layout->num_cpu_groups, buffer); - - pack32_array(step_layout->tasks, step_layout->num_hosts, buffer); - pack32_array(step_layout->hostids, step_layout->num_tasks, buffer); - for(i=0; i<step_layout->num_hosts; i++) { - pack32_array(step_layout->tids[i], step_layout->tasks[i], - buffer); - } + return slurm_unpack_slurm_addr_array(slurm_address, size_val, buffer); } -static int -_unpack_slurm_step_layout(slurm_step_layout_t **layout, Buf buffer) -{ - uint16_t uint16_tmp; - uint32_t uint32_tmp; - int cpu_cnt = 0, cpu_inx = 0, i; - slurm_step_layout_t *step_layout; - hostlist_t hl = NULL; - - step_layout = xmalloc(sizeof(slurm_step_layout_t)); - *layout = step_layout; - - step_layout->nodes = NULL; - step_layout->num_hosts = 0; - step_layout->host = NULL; - step_layout->tids = NULL; - step_layout->cpus = NULL; - step_layout->tasks = NULL; - step_layout->hostids = NULL; - step_layout->cpus_per_node = NULL; - step_layout->cpu_count_reps = NULL; - safe_unpackstr_xmalloc(&step_layout->nodes, &uint16_tmp, buffer); - safe_unpack16(&step_layout->num_hosts, buffer); - safe_unpack32(&step_layout->num_tasks, buffer); - safe_unpack16(&step_layout->num_cpu_groups, buffer); - - if (_unpack_slurm_addr_array(&(step_layout->node_addr), - &uint16_tmp, buffer)) - goto unpack_error; - if (uint16_tmp != step_layout->num_hosts) - goto unpack_error; - - safe_unpack32_array(&(step_layout->cpus_per_node), - &uint32_tmp, buffer); - if (uint32_tmp != step_layout->num_cpu_groups) - goto unpack_error; - - safe_unpack32_array(&(step_layout->cpu_count_reps), - &uint32_tmp, buffer); - if (uint32_tmp != step_layout->num_cpu_groups) - goto unpack_error; - - safe_unpack32_array(&(step_layout->tasks), - &uint32_tmp, buffer); - if (uint32_tmp != step_layout->num_hosts) - goto unpack_error; - - safe_unpack32_array(&(step_layout->hostids), - &uint32_tmp, buffer); - if (uint32_tmp != step_layout->num_tasks) - goto unpack_error; - - step_layout->tids = xmalloc(sizeof(uint32_t *) - * step_layout->num_hosts); - step_layout->host = xmalloc(sizeof(char *) - * step_layout->num_hosts); - step_layout->cpus = xmalloc(sizeof(uint32_t) - * step_layout->num_hosts); - - hl = hostlist_create(step_layout->nodes); - for(i=0; i<step_layout->num_hosts; i++) { - safe_unpack32_array(&(step_layout->tids[i]), - &uint32_tmp, - buffer); - if (uint32_tmp != step_layout->tasks[i]) - goto unpack_error; - - step_layout->host[i] = hostlist_shift(hl); - if(!step_layout->host[i]) { - error("hostlist incomplete for this job request"); - goto unpack_error; - } - step_layout->cpus[i] = step_layout->cpus_per_node[cpu_inx]; - - if ((++cpu_cnt) >= step_layout->cpu_count_reps[cpu_inx]) { - /* move to next record */ - cpu_inx++; - cpu_cnt = 0; - } - } - 
-    hostlist_destroy(hl);
-    if(step_layout->num_cpu_groups != cpu_inx) {
-        error("we got %d cpu groups but was looking for %d",
-              cpu_inx, step_layout->num_cpu_groups);
-    }
-    return SLURM_SUCCESS;
-
-unpack_error:
-    hostlist_destroy(hl);
-    step_layout_destroy(step_layout);
-    *layout = NULL;
-    return SLURM_ERROR;
-}
 
 static void
 _pack_ret_list(List ret_list,
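
[Editor's note: the hunks above replace the monolithic step-layout blob in RESPONSE_JOB_STEP_CREATE with explicit fields, serializing the jagged tids matrix as one count-prefixed pack32_array per node so the unpacker can validate each count against tasks[i]. Below is an illustrative, standalone sketch of that count-prefixed round trip; the put32/get32 toy buffer and the task numbers are invented for the example and are not SLURM's Buf/pack32_array API.]

#include <stdint.h>
#include <stdio.h>

/* toy append/read of one uint32 in network byte order; stands in for
 * SLURM's pack32()/safe_unpack32() for illustration only */
static void put32(uint8_t **p, uint32_t v)
{
    (*p)[0] = v >> 24; (*p)[1] = v >> 16; (*p)[2] = v >> 8; (*p)[3] = v;
    *p += 4;
}
static uint32_t get32(const uint8_t **p)
{
    uint32_t v = ((uint32_t)(*p)[0] << 24) | ((uint32_t)(*p)[1] << 16) |
                 ((uint32_t)(*p)[2] << 8) | (*p)[3];
    *p += 4;
    return v;
}

int main(void)
{
    /* two nodes: node 0 runs global tasks {0,2}, node 1 runs task {1} */
    uint32_t tasks[2] = { 2, 1 };
    uint32_t t0[2] = { 0, 2 }, t1[1] = { 1 };
    uint32_t *tids[2] = { t0, t1 };
    uint8_t buf[64], *w = buf;
    const uint8_t *r = buf;
    int i, j;

    /* pack: like pack32_array(), each array is count-prefixed */
    for (i = 0; i < 2; i++) {
        put32(&w, tasks[i]);
        for (j = 0; j < (int)tasks[i]; j++)
            put32(&w, tids[i][j]);
    }
    /* unpack: the count prefix must match tasks[i], as in the diff */
    for (i = 0; i < 2; i++) {
        uint32_t cnt = get32(&r);
        if (cnt != tasks[i])
            return 1;    /* the real code does "goto unpack_error" */
        for (j = 0; j < (int)cnt; j++)
            printf("node %d task %u\n", i, get32(&r));
    }
    return 0;
}
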
diff --git a/src/plugins/jobacct/common/common_slurmctld.c b/src/plugins/jobacct/common/common_slurmctld.c
index ae9df9b427ebc06b9df4b8cacb83e16dbd57e916..8f1603f8668f2daaee8073bbd98799589096347a 100644
--- a/src/plugins/jobacct/common/common_slurmctld.c
+++ b/src/plugins/jobacct/common/common_slurmctld.c
@@ -275,19 +275,24 @@ extern int common_step_start_slurmctld(struct step_record *step)
     if(quarter != (uint16_t)NO_VAL
        && nodecard != (uint16_t)NO_VAL)
         snprintf(node_list, BUFFER_SIZE,
-             "%s.%d.%d", step->step_node_list, quarter, nodecard);
+             "%s.%d.%d", step->step_layout->node_list,
+             quarter, nodecard);
     else if(quarter != (uint16_t)NO_VAL)
         snprintf(node_list, BUFFER_SIZE,
-             "%s.%d", step->step_node_list, quarter);
+             "%s.%d", step->step_layout->node_list, quarter);
     else
-        snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list);
+        snprintf(node_list, BUFFER_SIZE, "%s",
+             step->step_layout->node_list);
 #else
-    if(!step->num_cpus)
+    if(!step->step_layout || !step->step_layout->task_cnt) {
         cpus = step->job_ptr->num_procs;
-    else
-        cpus = step->num_cpus;
-    snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list);
+        snprintf(node_list, BUFFER_SIZE, "%s", step->job_ptr->nodes);
+    } else {
+        cpus = step->step_layout->task_cnt;
+        snprintf(node_list, BUFFER_SIZE, "%s",
+             step->step_layout->node_list);
+    }
 #endif
     if (step->job_ptr->account && step->job_ptr->account[0])
         account = step->job_ptr->account;
@@ -299,7 +304,7 @@ extern int common_step_start_slurmctld(struct step_record *step)
                step->step_id,     /* stepid */
                JOB_RUNNING,       /* completion status */
                0,                 /* completion code */
-               step->num_tasks,   /* number of tasks */
+               cpus,              /* number of tasks */
                cpus,              /* number of cpus */
                0,                 /* elapsed seconds */
                0,                 /* total cputime seconds */
@@ -387,30 +392,36 @@ extern int common_step_complete_slurmctld(struct step_record *step)
     if(quarter != (uint16_t)NO_VAL
        && nodecard != (uint16_t)NO_VAL)
         snprintf(node_list, BUFFER_SIZE,
-             "%s.%d.%d", step->step_node_list, quarter, nodecard);
+             "%s.%d.%d", step->step_layout->node_list,
+             quarter, nodecard);
     else if(quarter != (uint16_t)NO_VAL)
         snprintf(node_list, BUFFER_SIZE,
-             "%s.%d", step->step_node_list, quarter);
+             "%s.%d", step->step_layout->node_list, quarter);
     else
-        snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list);
+        snprintf(node_list, BUFFER_SIZE, "%s",
+             step->step_layout->node_list);
 #else
-    if(!step->num_cpus)
+    if(!step->step_layout || !step->step_layout->task_cnt) {
         cpus = step->job_ptr->num_procs;
-    else
-        cpus = step->num_cpus;
-    snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list);
+        snprintf(node_list, BUFFER_SIZE, "%s", step->job_ptr->nodes);
+
+    } else {
+        cpus = step->step_layout->task_cnt;
+        snprintf(node_list, BUFFER_SIZE, "%s",
+             step->step_layout->node_list);
+    }
 #endif
     /* figure out the ave of the totals sent */
-    if(step->num_tasks > 0) {
+    if(cpus > 0) {
         ave_vsize = jobacct->tot_vsize;
-        ave_vsize /= step->num_tasks;
+        ave_vsize /= cpus;
         ave_rss = jobacct->tot_rss;
-        ave_rss /= step->num_tasks;
+        ave_rss /= cpus;
         ave_pages = jobacct->tot_pages;
-        ave_pages /= step->num_tasks;
+        ave_pages /= cpus;
         ave_cpu = jobacct->tot_cpu;
-        ave_cpu /= step->num_tasks;
+        ave_cpu /= cpus;
         ave_cpu /= 100;
     }
     ave_cpu2 = jobacct->min_cpu;
@@ -426,7 +437,7 @@ extern int common_step_complete_slurmctld(struct step_record *step)
                step->step_id,     /* stepid */
                comp_status,       /* completion status */
                step->exit_code,   /* completion code */
-               step->num_tasks,   /* number of tasks */
+               cpus,              /* number of tasks */
                cpus,              /* number of cpus */
                elapsed,           /* elapsed seconds */
                /* total cputime seconds */
diff --git a/src/sacct/sacct_stat.c b/src/sacct/sacct_stat.c
index d438b31ad97727cd178fb3edc6013e141f7dfa17..78d65870a9a4ea9ba82066e00285eef16e907b83 100644
--- a/src/sacct/sacct_stat.c
+++ b/src/sacct/sacct_stat.c
@@ -39,7 +39,8 @@ step_rec_t step;
 int thr_finished = 0;
 
 void *_stat_thread(void *args);
-int _sacct_query(resource_allocation_response_msg_t *job, uint32_t step_id);
+int _sacct_query(slurm_step_layout_t *step_layout, uint32_t job_id,
+                 uint32_t step_id);
 int _process_results();
 
 void *_stat_thread(void *args)
@@ -149,36 +150,37 @@ cleanup:
     return NULL;
 }
 
-int _sacct_query(resource_allocation_response_msg_t *job, uint32_t step_id)
+int _sacct_query(slurm_step_layout_t *step_layout, uint32_t job_id,
+                 uint32_t step_id)
 {
     slurm_msg_t *msg_array_ptr;
     stat_jobacct_msg_t r;
     int i;
-    int *span = set_span(job->node_cnt, 0);
+    int *span = set_span(step_layout->node_cnt, 0);
     forward_t forward;
     int thr_count = 0;
 
     debug("getting the stat of job %d on %d nodes",
-          job->job_id, job->node_cnt);
+          job_id, step_layout->node_cnt);
 
     memset(&step.sacct, 0, sizeof(sacct_t));
     step.sacct.min_cpu = (float)NO_VAL;
 
-    step.header.jobnum = job->job_id;
+    step.header.jobnum = job_id;
     step.header.partition = NULL;
     step.header.blockid = NULL;
     step.stepnum = step_id;
-    step.nodes = job->node_list;
+    step.nodes = step_layout->node_list;
     step.stepname = NULL;
    step.status = JOB_RUNNING;
     step.ntasks = 0;
-    msg_array_ptr = xmalloc(sizeof(slurm_msg_t) * job->node_cnt);
+    msg_array_ptr = xmalloc(sizeof(slurm_msg_t) * step_layout->node_cnt);
 
     /* Common message contents */
-    r.job_id      = job->job_id;
+    r.job_id      = job_id;
     r.step_id     = step_id;
     r.jobacct     = jobacct_g_alloc(NULL);
 
-    forward.cnt   = job->node_cnt;
+    forward.cnt   = step_layout->node_cnt;
     /* we need this for forwarding, but not really anything else, so
        this can be set to any sting as long as there are the same
        number as hosts we are going to */
@@ -186,12 +188,12 @@ int _sacct_query(resource_allocation_response_msg_t *job, uint32_t step_id)
     for(i=0; i < forward.cnt; i++) {
         strncpy(&forward.name[i*MAX_SLURM_NAME], "-", MAX_SLURM_NAME);
     }
-    forward.addr = job->node_addr;
+    forward.addr = step_layout->node_addr;
     forward.node_id = NULL;
     forward.timeout = 5000;
 
     thr_count = 0;
-    for (i = 0; i < job->node_cnt; i++) {
+    for (i = 0; i < forward.cnt; i++) {
         pthread_attr_t attr;
         pthread_t threadid;
         slurm_msg_t *m = &msg_array_ptr[thr_count];
@@ -203,7 +205,7 @@ int _sacct_query(resource_allocation_response_msg_t *job, uint32_t step_id)
         m->orig_addr.sin_addr.s_addr = 0;
 
         memcpy(&m->address,
-               &job->node_addr[i],
+               &forward.addr[i],
               sizeof(slurm_addr));
 
         forward_set(&m->forward,
@@ -268,7 +270,7 @@ int sacct_stat(uint32_t jobid, uint32_t stepid)
     slurm_msg_t req_msg;
     slurm_msg_t resp_msg;
     stat_jobacct_msg_t req;
-    resource_allocation_response_msg_t *job = NULL;
+    slurm_step_layout_t *step_layout = NULL;
     int rc = SLURM_SUCCESS;
 
     debug("requesting info for job %u.%u", jobid, stepid);
@@ -286,8 +288,8 @@ int sacct_stat(uint32_t jobid, uint32_t stepid)
     jobacct_g_free(req.jobacct);
 
     switch (resp_msg.msg_type) {
-    case RESPONSE_RESOURCE_ALLOCATION:
-        job = (resource_allocation_response_msg_t *)resp_msg.data;
+    case RESPONSE_STEP_LAYOUT:
+        step_layout = (slurm_step_layout_t *)resp_msg.data;
         break;
     case RESPONSE_SLURM_RC:
         rc = ((return_code_msg_t *) resp_msg.data)->return_code;
@@ -300,13 +302,13 @@ int sacct_stat(uint32_t jobid, uint32_t stepid)
         break;
     }
 
-    if(!job) {
+    if(!step_layout) {
         error("didn't get the job record rc = %s", slurm_strerror(rc));
         return rc;
     }
-    _sacct_query(job, stepid);
-    slurm_free_resource_allocation_response_msg(job);
+    _sacct_query(step_layout, jobid, stepid);
+    step_layout_destroy(step_layout);
 
     _process_results();
     return rc;
diff --git a/src/slaunch/slaunch.c b/src/slaunch/slaunch.c
index a843aaf8c44f11e3fc02db61757ad96cce7551bf..347ae1d890fd277db3ef2b808a58641b5f8c1ab9 100644
--- a/src/slaunch/slaunch.c
+++ b/src/slaunch/slaunch.c
@@ -97,7 +97,7 @@ static void _run_srun_epilog (void);
 static int   _run_srun_script (char *script);
 #endif
 static void  _setup_local_fds(slurm_step_io_fds_t *cio_fds, int jobid,
-                              int stepid, slurm_step_layout_t *step_layout);
+                              int stepid, uint32_t *hostids);
 static void  _task_start(launch_tasks_response_msg_t *msg);
 static void  _task_finish(task_exit_msg_t *msg);
 static void  _mpir_init(int num_tasks);
@@ -112,9 +112,10 @@ int slaunch(int argc, char **argv)
     slurm_step_ctx step_ctx;
     slurm_job_step_launch_t params;
     int rc;
-
+    uint32_t *hostids = NULL;
     log_init(xbasename(argv[0]), logopt, 0, NULL);
-
+    int task_cnt = 0, i, j;
+
     /* Initialize plugin stack, read options from plugins, etc. */
     if (spank_init(NULL) < 0)
         fatal("Plug-in initialization failed");
@@ -199,10 +200,21 @@ int slaunch(int argc, char **argv)
     params.remote_output_filename = opt.remote_ofname;
     params.remote_input_filename = opt.remote_ifname;
     params.remote_error_filename = opt.remote_efname;
+
+    /* set up the hostids */
+    for (i=0; i<step_ctx->step_resp->node_cnt; i++) {
+        task_cnt += step_ctx->step_resp->tasks[i];
+    }
+
+    hostids = xmalloc(sizeof(uint32_t) * task_cnt);
+    for (i=0; i < step_ctx->step_resp->node_cnt; i++)
+        for (j=0; j<step_ctx->step_resp->tasks[i]; j++)
+            hostids[step_ctx->step_resp->tids[i][j]] = i;
+
+    /* FIXME - don't peek into the step context, that's cheating! */
     _setup_local_fds(&params.local_fds, (int)step_ctx->job_id,
-                     (int)step_ctx->step_resp->job_step_id,
-                     step_ctx->step_resp->step_layout);
+                     (int)step_ctx->step_resp->job_step_id, hostids);
     params.parallel_debug = opt.parallel_debug ? true : false;
     params.task_start_callback = _task_start;
     params.task_finish_callback = _task_finish;
@@ -423,7 +435,7 @@ static int _run_srun_script (char *script)
 
 static void
 _setup_local_fds(slurm_step_io_fds_t *cio_fds, int jobid, int stepid,
-                 slurm_step_layout_t *step_layout)
+                 uint32_t *hostids)
 {
     bool err_shares_out = false;
     fname_t *ifname, *ofname, *efname;
@@ -439,8 +451,9 @@ _setup_local_fds(slurm_step_io_fds_t *cio_fds, int jobid, int stepid,
         cio_fds->in.fd = STDIN_FILENO;
     } else if (ifname->type == IO_ONE) {
         cio_fds->in.taskid = ifname->taskid;
-        cio_fds->in.nodeid = step_layout_host_id(
-            step_layout, ifname->taskid);
+        cio_fds->in.nodeid = hostids[ifname->taskid];
+//      step_layout_host_id(
+//          step_layout, ifname->taskid);
     } else {
         cio_fds->in.fd = open(ifname->name, O_RDONLY);
         if (cio_fds->in.fd == -1)
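
[Editor's note: the slaunch change above open-codes what step_layout_host_id() used to answer: it inverts the per-node tids arrays (node index -> global task ids) into a flat hostids array (global task id -> node index) so _setup_local_fds() can find the node running a given task. A standalone sketch of the same inversion, with invented values and plain malloc in place of xmalloc:]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* same shape as step_resp->tasks/tids above:
     * node 0 owns global tasks {0,2}, node 1 owns {1,3} */
    uint32_t node_cnt = 2;
    uint32_t tasks[2] = { 2, 2 };
    uint32_t t0[2] = { 0, 2 }, t1[2] = { 1, 3 };
    uint32_t *tids[2] = { t0, t1 };
    uint32_t task_cnt = 0, i, j;
    uint32_t *hostids;

    for (i = 0; i < node_cnt; i++)
        task_cnt += tasks[i];

    /* invert node->tasks (tids) into task->node (hostids) */
    hostids = malloc(sizeof(uint32_t) * task_cnt);
    for (i = 0; i < node_cnt; i++)
        for (j = 0; j < tasks[i]; j++)
            hostids[tids[i][j]] = i;

    for (i = 0; i < task_cnt; i++)
        printf("task %u -> node %u\n", i, hostids[i]);
    free(hostids);
    return 0;
}
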
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index d7f234fe540d9351ffeaf4d9acd496f88f853815..5a781e1d579ea13c33af3d7eee229a3cdc864c01 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -860,7 +860,6 @@ static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer)
     pack16((uint16_t) step_ptr->step_id, buffer);
     pack16((uint16_t) step_ptr->cyclic_alloc, buffer);
     pack16((uint16_t)step_ptr->port, buffer);
-    pack32(step_ptr->num_tasks, buffer);
     pack32(step_ptr->exit_code, buffer);
     if (step_ptr->exit_code != NO_VAL) {
         pack_bit_fmt(step_ptr->exit_node_bitmap, buffer);
@@ -870,44 +869,46 @@ static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer)
     pack_time(step_ptr->start_time, buffer);
     packstr(step_ptr->host, buffer);
-    packstr(step_ptr->step_node_list, buffer);
     packstr(step_ptr->name, buffer);
     packstr(step_ptr->network, buffer);
     pack16((uint16_t)step_ptr->batch_step, buffer);
-    if (!step_ptr->batch_step)
+    if (!step_ptr->batch_step) {
+        pack_slurm_step_layout(step_ptr->step_layout, buffer);
         switch_pack_jobinfo(step_ptr->switch_job, buffer);
+    }
     checkpoint_pack_jobinfo(step_ptr->check_job, buffer);
 }
 
 /* Unpack job step state information from a buffer */
 static int _load_step_state(struct job_record *job_ptr, Buf buffer)
 {
-    struct step_record *step_ptr;
+    struct step_record *step_ptr = NULL;
     uint16_t step_id, cyclic_alloc, name_len, port, batch_step, bit_cnt;
-    uint32_t num_tasks, exit_code;
+    uint32_t exit_code;
     time_t start_time;
-    char *step_node_list = NULL, *host = NULL;
+    char *host = NULL;
     char *name = NULL, *network = NULL, *bit_fmt = NULL;
     switch_jobinfo_t switch_tmp = NULL;
     check_jobinfo_t check_tmp = NULL;
-
+    slurm_step_layout_t *step_layout = NULL;
+
     safe_unpack16(&step_id, buffer);
     safe_unpack16(&cyclic_alloc, buffer);
     safe_unpack16(&port, buffer);
-    safe_unpack32(&num_tasks, buffer);
     safe_unpack32(&exit_code, buffer);
     if (exit_code != NO_VAL) {
         safe_unpackstr_xmalloc(&bit_fmt, &name_len, buffer);
         safe_unpack16(&bit_cnt, buffer);
     }
-
+
     safe_unpack_time(&start_time, buffer);
     safe_unpackstr_xmalloc(&host, &name_len, buffer);
-    safe_unpackstr_xmalloc(&step_node_list, &name_len, buffer);
     safe_unpackstr_xmalloc(&name, &name_len, buffer);
     safe_unpackstr_xmalloc(&network, &name_len, buffer);
     safe_unpack16(&batch_step, buffer);
     if (!batch_step) {
+        if (unpack_slurm_step_layout(&step_layout, buffer))
+            goto unpack_error;
         switch_alloc_jobinfo(&switch_tmp);
         if (switch_unpack_jobinfo(switch_tmp, buffer))
             goto unpack_error;
@@ -929,22 +930,20 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer)
     if (step_ptr == NULL)
         goto unpack_error;
 
-    /* free any left-over values */
-    xfree(step_ptr->step_node_list);
-
     /* set new values */
     step_ptr->step_id      = step_id;
     step_ptr->cyclic_alloc = cyclic_alloc;
     step_ptr->name         = name;
     step_ptr->network      = network;
-    step_ptr->num_tasks    = num_tasks;
     step_ptr->port         = port;
     step_ptr->host         = host;
     step_ptr->batch_step   = batch_step;
     host                   = NULL;  /* re-used, nothing left to free */
     step_ptr->start_time   = start_time;
-    step_ptr->step_node_list = step_node_list;
-    step_node_list = NULL;  /* re-used, nothing left to free */
+
+    step_layout_destroy(step_ptr->step_layout);
+    step_ptr->step_layout  = step_layout;
+
     step_ptr->time_last_active = time(NULL);
     step_ptr->switch_job   = switch_tmp;
     step_ptr->check_job    = check_tmp;
@@ -964,7 +963,8 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer)
         xfree(bit_fmt);
     }
 
-    switch_g_job_step_allocated(switch_tmp, step_ptr->step_node_list);
+    switch_g_job_step_allocated(switch_tmp,
+                                step_ptr->step_layout->node_list);
     info("recovered job step %u.%u", job_ptr->job_id, step_id);
     return SLURM_SUCCESS;
 
@@ -972,10 +972,10 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer)
     xfree(host);
     xfree(name);
     xfree(network);
-    xfree(step_node_list);
     xfree(bit_fmt);
     if (switch_tmp)
         switch_free_jobinfo(switch_tmp);
+    step_layout_destroy(step_layout);
     return SLURM_FAILURE;
 }
 
@@ -2973,11 +2973,12 @@ static void _reset_step_bitmaps(struct job_record *job_ptr)
     step_iterator = list_iterator_create (job_ptr->step_list);
     while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
         FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
-        if ((step_ptr->step_node_list) &&
-            (node_name2bitmap(step_ptr->step_node_list, false,
-                              &step_ptr->step_node_bitmap))) {
+        if (step_ptr->step_layout &&
+            step_ptr->step_layout->node_list &&
+            (node_name2bitmap(step_ptr->step_layout->node_list, false,
+                              &step_ptr->step_node_bitmap))) {
             error("Invalid step_node_list (%s) for step_id %u.%u",
-                  step_ptr->step_node_list,
+                  step_ptr->step_layout->node_list,
                   job_ptr->job_id, step_ptr->step_id);
             delete_step_record (job_ptr, step_ptr->step_id);
         }
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 0975a196486e7fa3afec53cc08c44165217fc1ea..64a53875336550b9a21a5f1c06898f819f8d755d 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -465,7 +465,6 @@ find_node_record (char *name)
     return (struct node_record *) NULL;
 }
 
-
 /*
  * _hash_index - return a hash table index for the given node name
  * IN name = the node's name
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index edc105dcde5a0bcad4728acea314c0ece1a1f7ed..5461d70fcab2ff11594aab55aee2326d885022e6 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -419,7 +419,7 @@ static int _make_step_cred(struct step_record *step_rec,
     cred_arg.jobid    = step_rec->job_ptr->job_id;
     cred_arg.stepid   = step_rec->step_id;
     cred_arg.uid      = step_rec->job_ptr->user_id;
-    cred_arg.hostlist = step_rec->step_node_list;
+    cred_arg.hostlist = step_rec->step_layout->node_list;
     if(step_rec->job_ptr->details->exclusive)
         cred_arg.ntask_cnt = 0;
     else
@@ -501,10 +501,6 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
                (sizeof(uint32_t) * job_ptr->num_cpu_groups));
         alloc_msg.error_code     = error_code;
         alloc_msg.job_id         = job_ptr->job_id;
-        alloc_msg.node_addr      = xmalloc(sizeof(slurm_addr) *
-                                           job_ptr->node_cnt);
-        memcpy(alloc_msg.node_addr, job_ptr->node_addr,
-               (sizeof(slurm_addr) * job_ptr->node_cnt));
         alloc_msg.node_cnt       = job_ptr->node_cnt;
         alloc_msg.node_list      = xstrdup(job_ptr->nodes);
         alloc_msg.num_cpu_groups = job_ptr->num_cpu_groups;
@@ -522,7 +518,6 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
             _kill_job_on_msg_fail(job_ptr->job_id);
         xfree(alloc_msg.cpu_count_reps);
         xfree(alloc_msg.cpus_per_node);
-        xfree(alloc_msg.node_addr);
         xfree(alloc_msg.node_list);
         select_g_free_jobinfo(&alloc_msg.select_jobinfo);
         schedule_job_save();    /* has own locks */
@@ -1004,6 +999,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
     slurmctld_lock_t job_write_lock = {
         NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
     uid_t uid;
+    int i;
 
     START_TIMER;
     debug2("Processing RPC: REQUEST_JOB_STEP_CREATE");
@@ -1044,17 +1040,38 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
             slurm_strerror(error_code));
         slurm_send_rc_msg(msg, error_code);
     } else {
+        slurm_step_layout_t *layout = step_rec->step_layout;
+
         info("_slurm_rpc_job_step_create: StepId=%u.%u %s %s",
              step_rec->job_ptr->job_id, step_rec->step_id,
             req_step_msg->node_list, TIME_STR);
 
         job_step_resp.job_step_id = step_rec->step_id;
-        job_step_resp.node_list   = xstrdup(req_step_msg->node_list);
+        job_step_resp.node_cnt    = layout->node_cnt;
+        job_step_resp.node_list   = xstrdup(layout->node_list);
+        job_step_resp.node_addr   = xmalloc(sizeof(slurm_addr) *
+                                            layout->node_cnt);
+        memcpy(job_step_resp.node_addr, layout->node_addr,
+               (sizeof(slurm_addr) * layout->node_cnt));
+
+        job_step_resp.tasks =
+            xmalloc(sizeof(uint32_t) * layout->node_cnt);
+        memcpy(job_step_resp.tasks, layout->tasks,
+               (sizeof(uint32_t) * layout->node_cnt));
+
+        job_step_resp.tids =
+            xmalloc(sizeof(uint32_t *) * layout->node_cnt);
+        for (i=0; i<layout->node_cnt; i++) {
+            job_step_resp.tids[i] = xmalloc(sizeof(uint32_t) *
+                                            layout->tasks[i]);
+            memcpy(job_step_resp.tids[i], layout->tids[i],
+                   (sizeof(uint32_t) * layout->tasks[i]));
+        }
+
         job_step_resp.cred        = slurm_cred;
         job_step_resp.switch_job  = switch_copy_jobinfo(
             step_rec->switch_job);
-        job_step_resp.step_layout =
-            step_layout_copy(step_rec->step_layout);
+
         unlock_slurmctld(job_write_lock);
         resp.address = msg->address;
         resp.msg_type = RESPONSE_JOB_STEP_CREATE;
@@ -1064,10 +1081,17 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
         resp.forward_struct_init = 0;
         slurm_send_node_msg(msg->conn_fd, &resp);
 
-        xfree(job_step_resp.node_list);
         slurm_cred_destroy(slurm_cred);
         switch_free_jobinfo(job_step_resp.switch_job);
-        step_layout_destroy(job_step_resp.step_layout);
+        xfree(job_step_resp.node_list);
+        xfree(job_step_resp.node_addr);
+        for (i=0; i<job_step_resp.node_cnt; i++) {
+            xfree(job_step_resp.tids[i]);
+        }
+
+        xfree(job_step_resp.tasks);
+        xfree(job_step_resp.tids);
+
         schedule_job_save();    /* Sets own locks */
     }
 }
@@ -1368,10 +1392,6 @@ static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg)
                (sizeof(uint32_t) * job_ptr->num_cpu_groups));
         job_info_resp_msg.error_code     = error_code;
         job_info_resp_msg.job_id         = job_info_msg->job_id;
-        job_info_resp_msg.node_addr      = xmalloc(sizeof(slurm_addr) *
-                                                   job_ptr->node_cnt);
-        memcpy(job_info_resp_msg.node_addr, job_ptr->node_addr,
-               (sizeof(slurm_addr) * job_ptr->node_cnt));
         job_info_resp_msg.node_cnt       = job_ptr->node_cnt;
         job_info_resp_msg.node_list      = xstrdup(job_ptr->nodes);
         job_info_resp_msg.num_cpu_groups = job_ptr->num_cpu_groups;
@@ -1389,7 +1409,6 @@ static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg)
         select_g_free_jobinfo(&job_info_resp_msg.select_jobinfo);
         xfree(job_info_resp_msg.cpu_count_reps);
         xfree(job_info_resp_msg.cpus_per_node);
-        xfree(job_info_resp_msg.node_addr);
         xfree(job_info_resp_msg.node_list);
     }
 }
@@ -1645,23 +1664,18 @@ static void _slurm_rpc_step_complete(slurm_msg_t *msg)
  *    represent the termination of an entire job */
 static void  _slurm_rpc_stat_jobacct(slurm_msg_t * msg)
 {
-    int error_code = SLURM_SUCCESS, i = 0, i2 = 0, i3 = 0;
-    int count = 0, count2 = 0;
+    int error_code = SLURM_SUCCESS;
     slurm_msg_t response_msg;
     DEF_TIMERS;
     stat_jobacct_msg_t *req = (stat_jobacct_msg_t *)msg->data;
-    resource_allocation_response_msg_t resp;
+    slurm_step_layout_t *step_layout = NULL;
     /* Locks: Write job, write node */
     slurmctld_lock_t job_read_lock = {
         NO_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
     uid_t uid = g_slurm_auth_get_uid(msg->auth_cred);
     struct job_record *job_ptr = NULL;
     struct step_record *step_ptr = NULL;
-    char bitstring[BUFFER_SIZE];
-    int *node_pos = NULL;
-    int node_cnt = 0;
-
-
+
     START_TIMER;
     debug2("Processing RPC: MESSAGE_STAT_JOBACCT");
 
@@ -1686,55 +1700,17 @@ static void _slurm_rpc_stat_jobacct(slurm_msg_t * msg)
             slurm_send_rc_msg(msg, ESLURM_INVALID_JOB_ID);
             return;
         }
-        node_cnt = bit_set_count(step_ptr->step_node_bitmap);
-        resp.node_addr = xmalloc(sizeof(slurm_addr) * node_cnt);
-
-        bit_fmt(bitstring, BUFFER_SIZE, step_ptr->step_node_bitmap);
-        node_pos = bitfmt2int(bitstring);
-        count = 0;
-        count2 = 0;
-        i = node_pos[count++];
-        i2 = node_pos[count++];
-        while(i != -1) {
-            if(i2 == -1) {
-                memcpy(&resp.node_addr[count2++],
-                       &node_record_table_ptr[i].slurm_addr,
-                       sizeof(slurm_addr));
-
-
-            } else {
-                for(i3=i; i3 <= i2; i3++) {
-                    if(i3 == -1) {
-                        error("error with bitfmt2int "
-                              "on the %d one",
-                              i3);
-                        break;
-                    }
-                    memcpy(&resp.node_addr[count2++],
-                           &node_record_table_ptr[i3].
-                           slurm_addr,
-                           sizeof(slurm_addr));
-                }
-            }
-            i = node_pos[count++];
-            i2 = node_pos[count++];
-        }
-        resp.node_list = xstrdup(step_ptr->step_node_list);
-        resp.node_cnt = node_cnt;
-        resp.job_id = req->job_id;
-        resp.num_cpu_groups = 0;
-        resp.select_jobinfo = NULL;
+        step_layout = step_layout_copy(step_ptr->step_layout);
         unlock_slurmctld(job_read_lock);
-        response_msg.msg_type = RESPONSE_RESOURCE_ALLOCATION;
-        response_msg.data = &resp;
+
+        response_msg.msg_type = RESPONSE_STEP_LAYOUT;
+        response_msg.data = step_layout;
         forward_init(&response_msg.forward, NULL);
         response_msg.ret_list = NULL;
         response_msg.forward_struct_init = 0;
 
         slurm_send_node_msg(msg->conn_fd, &response_msg);
-        xfree(resp.node_list);
-        xfree(resp.node_addr);
-        xfree(node_pos);
+        step_layout_destroy(step_layout);
     }
 }
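
[Editor's note: in the RPC handler above, node_addr and tasks can each be copied with a single memcpy, but tids is jagged (row i has tasks[i] entries), so both the copy and the later cleanup must walk the rows. A minimal standalone illustration of that copy/free pairing, with plain malloc/free standing in for xmalloc/xfree and invented values:]

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* deep-copy a jagged task-id matrix row by row, as the handler copies
 * layout->tids into job_step_resp.tids */
static uint32_t **copy_tids(uint32_t **tids, uint32_t *tasks, int node_cnt)
{
    uint32_t **copy = malloc(sizeof(uint32_t *) * node_cnt);
    int i;

    for (i = 0; i < node_cnt; i++) {
        copy[i] = malloc(sizeof(uint32_t) * tasks[i]);
        memcpy(copy[i], tids[i], sizeof(uint32_t) * tasks[i]);
    }
    return copy;
}

/* the matching free must also walk the rows, as the handler's cleanup
 * loop does after the response is sent */
static void free_tids(uint32_t **tids, int node_cnt)
{
    int i;
    for (i = 0; i < node_cnt; i++)
        free(tids[i]);
    free(tids);
}

int main(void)
{
    uint32_t t0[2] = { 0, 1 }, t1[1] = { 2 };
    uint32_t *tids[2] = { t0, t1 };
    uint32_t tasks[2] = { 2, 1 };
    uint32_t **copy = copy_tids(tids, tasks, 2);
    free_tids(copy, 2);
    return 0;
}
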
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 6d9ec2a9cb58fc6f7c4eaf2a5c896fc74394378d..b535e43c835348e779831f3b81ec06c5f0bf596d 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -351,11 +351,7 @@ struct step_record {
     uint16_t step_id;           /* step number */
     uint16_t cyclic_alloc;      /* set for cyclic task allocation
                                    across nodes */
-    uint32_t num_tasks;         /* number of tasks required */
-    uint32_t num_cpus;          /* number of cpus required */
     time_t start_time;          /* step allocation time */
-    char *step_node_list;       /* list of nodes allocated to job
-                                   step */
     bitstr_t *step_node_bitmap; /* bitmap of nodes allocated to job
                                    step */
     time_t time_last_active;    /* time of last job activity */
@@ -1149,7 +1145,7 @@ extern int slurmctld_shutdown(void);
 
 /*
  * step_create - creates a step_record in step_specs->job_id, sets up the
- *    accoding to the step_specs.
+ *    according to the step_specs.
  * IN step_specs - job step specifications
  * OUT new_step_record - pointer to the new step_record (NULL on error)
  * IN kill_job_when_step_done - if set kill the job on step completion
@@ -1163,6 +1159,23 @@ extern int step_create ( job_step_create_request_msg_t *step_specs,
                          bool kill_job_when_step_done,
                          bool batch_step );
 
+/*
+ * step_layout_create - creates a step_layout according to the inputs.
+ * IN step_ptr - step record the layout is built for (its job record
+ *    supplies the per-node cpu counts)
+ * IN step_node_list - node list of hosts in step
+ * IN node_count - number of nodes in the step
+ * IN num_tasks - number of tasks in step
+ * IN task_dist - type of task distribution
+ * RET - NULL or slurm_step_layout_t *
+ * NOTE: the returned step_layout must be freed, normally when the
+ *    step itself is freed.
+ */
+extern slurm_step_layout_t *step_layout_create(struct step_record *step_ptr,
+                                               char *step_node_list,
+                                               uint16_t node_count,
+                                               uint32_t num_tasks,
+                                               uint16_t task_dist);
 
 /*
  * step_epilog_complete - note completion of epilog on some node and
  *    release it's switch windows if appropriate. can perform partition
diff --git a/src/slurmctld/srun_comm.c b/src/slurmctld/srun_comm.c
index 7b9fa2db4ebea042afdcca1b9645325035884e9b..1141229e36de262ee450f83c922a1c5db819c096 100644
--- a/src/slurmctld/srun_comm.c
+++ b/src/slurmctld/srun_comm.c
@@ -88,10 +88,6 @@ extern void srun_allocate (uint32_t job_id)
         memcpy(msg_arg->cpu_count_reps, job_ptr->cpu_count_reps,
                (sizeof(uint32_t) * job_ptr->num_cpu_groups));
         msg_arg->node_cnt       = job_ptr->node_cnt;
-        msg_arg->node_addr      = xmalloc(sizeof (slurm_addr) *
-                                          job_ptr->node_cnt);
-        memcpy(msg_arg->node_addr, job_ptr->node_addr,
-               (sizeof(slurm_addr) * job_ptr->node_cnt));
         msg_arg->select_jobinfo = select_g_copy_jobinfo(
                 job_ptr->select_jobinfo);
         msg_arg->error_code     = SLURM_SUCCESS;
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 28c1db8077e424edee223b2d3551067c212a6f84..2489afc9a3c862882649d6422ca81773aa26f967 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -110,14 +110,14 @@ delete_all_step_records (struct job_record *job_ptr)
     while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
         list_remove (step_iterator);
         if (step_ptr->switch_job) {
-            switch_g_job_step_complete(step_ptr->switch_job,
-                                       step_ptr->step_node_list);
+            switch_g_job_step_complete(
+                step_ptr->switch_job,
+                step_ptr->step_layout->node_list);
             switch_free_jobinfo(step_ptr->switch_job);
         }
         checkpoint_free_jobinfo(step_ptr->check_job);
         xfree(step_ptr->host);
         xfree(step_ptr->name);
-        xfree(step_ptr->step_node_list);
         step_layout_destroy(step_ptr->step_layout);
         jobacct_g_free(step_ptr->jobacct);
         FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
@@ -159,13 +159,12 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id)
             if (step_ptr->switch_job) {
                 switch_g_job_step_complete(
                     step_ptr->switch_job,
-                    step_ptr->step_node_list);
+                    step_ptr->step_layout->node_list);
                 switch_free_jobinfo (step_ptr->switch_job);
             }
             checkpoint_free_jobinfo (step_ptr->check_job);
             xfree(step_ptr->host);
             xfree(step_ptr->name);
-            xfree(step_ptr->step_node_list);
             step_layout_destroy(step_ptr->step_layout);
             jobacct_g_free(step_ptr->jobacct);
             FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
@@ -625,6 +624,7 @@ step_create(job_step_create_request_msg_t *step_specs,
     bitstr_t *nodeset;
     int node_count;
     time_t now = time(NULL);
+    char *step_node_list = NULL;
 
     *new_step_record = NULL;
     job_ptr = find_job_record (step_specs->job_id);
@@ -690,18 +690,16 @@ step_create(job_step_create_request_msg_t *step_specs,
     /* Here is where the node list is set for the job */
     if(step_specs->node_list
        && step_specs->task_dist == SLURM_DIST_ARBITRARY) {
-        step_ptr->step_node_list = xstrdup(step_specs->node_list);
+        step_node_list = xstrdup(step_specs->node_list);
         xfree(step_specs->node_list);
         step_specs->node_list = bitmap2node_name(nodeset);
     } else {
-        step_ptr->step_node_list = bitmap2node_name(nodeset);
-        step_specs->node_list = xstrdup(step_ptr->step_node_list);
+        step_node_list = bitmap2node_name(nodeset);
+        step_specs->node_list = xstrdup(step_node_list);
     }
     step_ptr->step_node_bitmap = nodeset;
     step_ptr->cyclic_alloc =
         (uint16_t) (step_specs->task_dist == SLURM_DIST_CYCLIC);
-    step_ptr->num_tasks = step_specs->num_tasks;
-    step_ptr->num_cpus = step_specs->cpu_count;
     step_ptr->time_last_active = now;
     step_ptr->port = step_specs->port;
     step_ptr->host = xstrdup(step_specs->host);
@@ -722,21 +720,18 @@ step_create(job_step_create_request_msg_t *step_specs,
     /* a batch script does not need switch info */
     if (!batch_step) {
         step_ptr->step_layout =
-            distribute_tasks(job_ptr->nodes,
-                             step_ptr->step_node_list,
-                             job_ptr->cpus_per_node,
-                             job_ptr->cpu_count_reps,
-                             job_ptr->num_cpu_groups,
-                             step_specs->node_count,
-                             step_ptr->num_tasks,
-                             step_specs->task_dist);
+            step_layout_create(step_ptr,
+                               step_node_list,
+                               step_specs->node_count,
+                               step_specs->num_tasks,
+                               step_specs->task_dist);
         if (!step_ptr->step_layout)
             return SLURM_ERROR;
 
         if (switch_alloc_jobinfo (&step_ptr->switch_job) < 0)
             fatal ("step_create: switch_alloc_jobinfo error");
         if (switch_build_jobinfo(step_ptr->switch_job,
-                                 step_ptr->step_node_list,
+                                 step_ptr->step_layout->node_list,
                                  step_ptr->step_layout->tasks,
                                  step_ptr->cyclic_alloc,
                                  step_ptr->network) < 0) {
@@ -744,12 +739,6 @@ step_create(job_step_create_request_msg_t *step_specs,
             delete_step_record (job_ptr, step_ptr->step_id);
             return ESLURM_INTERCONNECT_FAILURE;
         }
-        /* FIXME: this var should be removed all together once the
-           switch_build_jobinfo is rewritten to take the step_layout
-           structure */
-        xfree(step_ptr->step_node_list);
-        step_ptr->step_node_list =
-            xstrdup(step_ptr->step_layout->nodes);
     }
     if (checkpoint_alloc_jobinfo (&step_ptr->check_job) < 0)
         fatal ("step_create: checkpoint_alloc_jobinfo error");
@@ -759,19 +748,68 @@ step_create(job_step_create_request_msg_t *step_specs,
     return SLURM_SUCCESS;
 }
 
+extern slurm_step_layout_t *step_layout_create(struct step_record *step_ptr,
+                                               char *step_node_list,
+                                               uint16_t node_count,
+                                               uint32_t num_tasks,
+                                               uint16_t task_dist)
+{
+    uint32_t cpus_per_node[node_count];
+    uint32_t cpu_count_reps[node_count];
+    int cpu_inx = -1;
+    int usable_cpus = 0, i;
+    int inx = 0;
+    struct job_record *job_ptr = step_ptr->job_ptr;
+    uint32_t node_cnt = job_ptr->cpu_count_reps[inx];
+
+    /* set the correct cpus in use from the job */
+
+    for (i = 0; i < job_ptr->node_cnt; i++) {
+        if (bit_test(step_ptr->step_node_bitmap, i)) {
+            while(i >= node_cnt)
+                node_cnt +=
+                    job_ptr->cpu_count_reps[++inx];
+
+            usable_cpus = job_ptr->cpus_per_node[inx];
+            if ((cpu_inx == -1) ||
+                (cpus_per_node[cpu_inx] !=
+                 usable_cpus)) {
+                cpu_inx++;
+                cpus_per_node[cpu_inx] = usable_cpus;
+                cpu_count_reps[cpu_inx] = 1;
+            } else
+                cpu_count_reps[cpu_inx]++;
+        }
+    }
+    /* layout the tasks on the nodes */
+    return distribute_tasks(step_node_list,
+                            cpus_per_node, cpu_count_reps, (cpu_inx+1),
+                            node_count, num_tasks, task_dist);
+}
+
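
[Editor's note: step_layout_create() above re-compresses the job's run-length-encoded (cpus_per_node, cpu_count_reps) arrays down to only the nodes selected in the step's bitmap before handing them to distribute_tasks(). A standalone analog of that re-compression, with a plain int array standing in for bitstr_t/bit_test() and invented counts:]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* job-level RLE: 2 nodes with 4 cpus, then 2 nodes with 2 cpus */
    uint32_t job_cpus_per_node[] = { 4, 2 };
    uint32_t job_cpu_count_reps[] = { 2, 2 };
    int in_step[4] = { 1, 0, 1, 1 };    /* stands in for bit_test() */
    int job_node_cnt = 4;

    uint32_t cpus_per_node[4], cpu_count_reps[4], usable_cpus;
    uint32_t node_cnt = job_cpu_count_reps[0];
    int cpu_inx = -1, inx = 0, i;

    for (i = 0; i < job_node_cnt; i++) {
        if (!in_step[i])
            continue;
        /* advance to the RLE group covering node i */
        while ((uint32_t)i >= node_cnt)
            node_cnt += job_cpu_count_reps[++inx];

        usable_cpus = job_cpus_per_node[inx];
        if (cpu_inx == -1 || cpus_per_node[cpu_inx] != usable_cpus) {
            cpu_inx++;    /* start a new group */
            cpus_per_node[cpu_inx] = usable_cpus;
            cpu_count_reps[cpu_inx] = 1;
        } else
            cpu_count_reps[cpu_inx]++;
    }
    /* prints "4 cpus x 1 nodes" then "2 cpus x 2 nodes" */
    for (i = 0; i <= cpu_inx; i++)
        printf("%u cpus x %u nodes\n",
               cpus_per_node[i], cpu_count_reps[i]);
    return 0;
}
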
 /* Pack the data for a specific job step record
  * IN step - pointer to a job step record
  * IN/OUT buffer - location to store data, pointers automatically advanced
 */
 static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer)
 {
+    int task_cnt;
+    char *node_list = NULL;
+
+    if(step->step_layout) {
+        task_cnt = step->step_layout->task_cnt;
+        node_list = step->step_layout->node_list;
+    } else {
+        task_cnt = step->job_ptr->num_procs;
+        node_list = step->job_ptr->nodes;
+    }
     pack_job_step_info_members(step->job_ptr->job_id,
                                step->step_id,
                                step->job_ptr->user_id,
-                               step->num_tasks,
+                               task_cnt,
                                step->start_time,
                                step->job_ptr->partition,
-                               step->step_node_list,
+                               node_list,
                                step->name,
                                step->network,
                                buffer);
 }
@@ -1116,12 +1154,12 @@ extern int step_partial_comp(step_complete_msg_t *req, int *rem,
         /* release all switch windows */
         if (step_ptr->switch_job) {
             debug2("full switch release for step %u.%u, "
-                   "nodes %s", req->job_id,
-                   req->job_step_id,
-                   step_ptr->step_node_list);
+                   "nodes %s", req->job_id,
+                   req->job_step_id,
+                   step_ptr->step_layout->node_list);
             switch_g_job_step_complete(
                 step_ptr->switch_job,
-                step_ptr->step_node_list);
+                step_ptr->step_layout->node_list);
             switch_free_jobinfo (step_ptr->switch_job);
             step_ptr->switch_job = NULL;
         }
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index 0eef5562b78b485a42f7e72fce42b5883a997a83..bf658374c91b0b5625b8d7ea70352c2b275a2dfc 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -592,6 +592,7 @@ create_job_step(srun_job_t *job)
 {
     job_step_create_request_msg_t *req  = NULL;
     job_step_create_response_msg_t *resp = NULL;
+    int i;
     if (!(req = _step_req_create(job))) {
         error ("Unable to allocate step request message");
         return -1;
@@ -605,7 +606,16 @@ create_job_step(srun_job_t *job)
     job->stepid = resp->job_step_id;
     job->cred   = resp->cred;
     job->switch_job = resp->switch_job;
-    job->step_layout = resp->step_layout;
+    job->step_layout = xmalloc(sizeof(slurm_step_layout_t));
+    job->step_layout->node_cnt = resp->node_cnt;
+    job->step_layout->node_list = resp->node_list;
+    job->step_layout->node_addr = resp->node_addr;
+    job->step_layout->tasks = resp->tasks;
+    job->step_layout->task_cnt = 0;
+    for(i=0; i<job->step_layout->node_cnt; i++)
+        job->step_layout->task_cnt += job->step_layout->tasks[i];
+    job->step_layout->tids = resp->tids;
+
     if(!job->step_layout) {
         error("step_layout not returned");
         return -1;
diff --git a/src/srun/launch.c b/src/srun/launch.c
index 4f6a9fb01153f5ee2ff9625805cca11fd072497e..77188ccf6b1792e1d8b8197a210bebecba9fca2b 100644
--- a/src/srun/launch.c
+++ b/src/srun/launch.c
@@ -119,17 +119,17 @@ launch(void *arg)
     hostlist_t hostlist = NULL;
     hostlist_iterator_t itr = NULL;
     char *host = NULL;
-    int *span = set_span(job->step_layout->num_hosts, 0);
+    int *span = set_span(job->step_layout->node_cnt, 0);
     slurm_msg_t *m = NULL;
     Buf buffer = NULL;
 
     update_job_state(job, SRUN_JOB_LAUNCHING);
 
     debug("going to launch %d tasks on %d hosts",
-          opt.nprocs, job->step_layout->num_hosts);
+          opt.nprocs, job->step_layout->node_cnt);
 
     msg_array_ptr = xmalloc(sizeof(slurm_msg_t)
-                            * job->step_layout->num_hosts);
+                            * job->step_layout->node_cnt);
 
     my_envc = envcount(environ);
     /* Common message contents */
@@ -143,7 +143,7 @@ launch(void *arg)
     r.envc          = my_envc;
     r.env           = environ;
     r.cwd           = opt.cwd;
-    r.nnodes        = job->step_layout->num_hosts;
+    r.nnodes        = job->step_layout->node_cnt;
     r.nprocs        = opt.nprocs;
     r.slurmd_debug  = opt.slurmd_debug;
     r.switch_job    = job->switch_job;
@@ -168,13 +168,13 @@ launch(void *arg)
 
     /* Node specific message contents */
     if (slurm_mpi_single_task_per_node ()) {
-        for (i = 0; i < job->step_layout->num_hosts; i++)
+        for (i = 0; i < job->step_layout->node_cnt; i++)
             job->step_layout->tasks[i] = 1;
     }
     r.tasks_to_launch = job->step_layout->tasks;
     r.global_task_ids = job->step_layout->tids;
-    r.cpus_allocated  = job->step_layout->cpus;
+    r.cpus_allocated  = job->step_layout->tasks;
 
     r.num_resp_port = job->njfds;
     r.resp_port = xmalloc(sizeof(uint16_t) * r.num_resp_port);
@@ -193,19 +193,18 @@ launch(void *arg)
     buffer = slurm_pack_msg_no_header(&msg_array_ptr[0]);
 
     //hostlist = hostlist_create(job->nodelist);
-    debug("sending to list %s", job->step_layout->nodes);
-    hostlist = hostlist_create(job->step_layout->nodes);
+    debug("sending to list %s", job->step_layout->node_list);
+    hostlist = hostlist_create(job->step_layout->node_list);
     itr = hostlist_iterator_create(hostlist);
     job->thr_count = 0;
     i=0;
-    for(i=0; i<job->step_layout->num_hosts; i++) {
+    for(i=0; i<job->step_layout->node_cnt; i++) {
         host = hostlist_next(itr);
-        if(!job->step_layout->host[i] || !host)
+        if(!host)
             break;
+        debug("sending to %s id %d", host, i);
+        free(host);
-        //for (i = 0; i < job->step_layout->num_hosts; i++) {
-        debug("sending to %s %s %d %d", job->step_layout->host[i],
-              host, i, job->step_layout->num_hosts);
         m = &msg_array_ptr[job->thr_count];
         m->srun_node_id = (uint32_t)i;
@@ -222,11 +221,12 @@ launch(void *arg)
         forward_set_launch(&m->forward,
                            span[job->thr_count],
                            &i,
-                           job->step_layout,
+                           job->step_layout->node_cnt,
+                           job->step_layout->node_addr,
                            itr,
                            opt.msg_timeout);
         job->thr_count++;
-        free(host);
+
     }
     xfree(span);
     hostlist_iterator_destroy(itr);
@@ -365,6 +365,7 @@ static void _p_launch(slurm_msg_t *req, srun_job_t *job)
     int i;
     thd_t *thd;
     int rc = 0;
+    char *name = NULL;
 
     /*
      * SigFunc *oldh;
@@ -384,7 +385,10 @@ static void _p_launch(slurm_msg_t *req, srun_job_t *job)
     for (i = 0; i < job->thr_count; i++) {
         if (job->step_layout->tasks[i] == 0)    {
             /* No tasks for this node */
-            debug("Node %s is unused",job->step_layout->host[i]);
+            name = nodelist_nth_host(job->step_layout->node_list,
+                                     i);
+            debug("Node %s is unused",name);
+            free(name);
             job->host_state[i] = SRUN_HOST_REPLIED;
             thd[i].thread = (pthread_t) NULL;
             continue;
@@ -407,7 +411,7 @@ static void _p_launch(slurm_msg_t *req, srun_job_t *job)
         _spawn_launch_thr(&thd[i]);
     }
 
-    for ( ; i < job->step_layout->num_hosts; i++)
+    for ( ; i < job->step_layout->node_cnt; i++)
         _update_failed_node(job, i);
 
     pthread_mutex_lock(&active_mutex);
@@ -510,12 +514,16 @@ static void * _p_launch_task(void *arg)
     ListIterator data_itr;
     ret_types_t *ret_type = NULL;
     ret_data_info_t *ret_data_info = NULL;
-
+    char *name = NULL;
+
     th->state  = DSH_ACTIVE;
     th->tstart = time(NULL);
 
-    if (_verbose)
-        _print_launch_msg(msg, job->step_layout->host[nodeid], nodeid);
+    if (_verbose) {
+        name = nodelist_nth_host(job->step_layout->node_list, nodeid);
+        _print_launch_msg(msg, name, nodeid);
+        free(name);
+    }
 again:
     //ret_list = slurm_send_recv_rc_msg(req, opt.msg_timeout);
     ret_list = slurm_send_recv_rc_packed_msg(req, opt.msg_timeout);
@@ -534,12 +542,17 @@ again:
                 ret_data_info->nodeid);
             continue;
         }
+
         errno = ret_type->err;
-        if (errno != EINTR)
+        if (errno != EINTR) {
+            name = nodelist_nth_host(
+                job->step_layout->node_list,
+                ret_data_info->nodeid);
             verbose("first launch error on %s: %m",
-                    job->step_layout->
-                    host[ret_data_info->nodeid]);
-
+                    name);
+            free(name);
+        }
+
         if ((errno != ETIMEDOUT)
             && (job->state == SRUN_JOB_LAUNCHING)
             && (errno != ESLURMD_INVALID_JOB_CREDENTIAL)
@@ -550,16 +563,14 @@ again:
             sleep(1);
             goto again;
         }
-
+        name = nodelist_nth_host(job->step_layout->node_list,
+                                 ret_data_info->nodeid);
+
         if (errno == EINTR)
-            verbose("launch on %s canceled",
-                    job->step_layout->
-                    host[ret_data_info->nodeid]);
+            verbose("launch on %s canceled", name);
         else
-            error("second launch error on %s: %m",
-                  job->step_layout->
-                  host[ret_data_info->nodeid]);
-
+            error("second launch error on %s: %m", name);
+        free(name);
         _update_failed_node(job, ret_data_info->nodeid);
 
         th->state = DSH_FAILED;
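
[Editor's note: with the cached step_layout->host[] array gone, launch.c now asks nodelist_nth_host() for a freshly allocated copy of the i-th hostname and free()s it after each use. The sketch below is a loose stand-in for that lookup and handles only a plain comma-separated list; the real function goes through the hostlist API, which also expands bracketed ranges such as "tux[0-15]".]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* hypothetical minimal analog of nodelist_nth_host(): return a
 * malloc'd copy of the n-th name in a comma-separated list */
static char *nth_host(const char *list, int n)
{
    const char *p = list, *end;

    while (n-- > 0) {
        p = strchr(p, ',');
        if (!p)
            return NULL;
        p++;
    }
    end = strchr(p, ',');
    if (!end)
        end = p + strlen(p);
    return strndup(p, end - p);
}

int main(void)
{
    char *h = nth_host("tux0,tux1,tux2", 1);
    printf("%s\n", h ? h : "(none)");    /* prints tux1 */
    free(h);    /* caller frees, like the free(name) calls above */
    return 0;
}
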
diff --git a/src/srun/msg.c b/src/srun/msg.c
index beeb11ca5bd0c1e8e543c9cd7c32a499b4146b59..65aa79e0696c3482df50f0dec900be960279c1a1 100644
--- a/src/srun/msg.c
+++ b/src/srun/msg.c
@@ -40,6 +40,7 @@
 #include <sys/poll.h>
 #include <sys/wait.h>
 #include <time.h>
+#include <stdlib.h>
 
 #include <slurm/slurm_errno.h>
 
@@ -149,10 +150,11 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job)
     int len;
     char *executable = NULL;
     int i;
-
+    char *name = NULL;
+
     /* some initialization */
     if (MPIR_proctable_size == 0) {
-        MPIR_proctable_size = job->step_layout->num_tasks;
+        MPIR_proctable_size = job->step_layout->task_cnt;
         MPIR_proctable = xmalloc(sizeof(MPIR_PROCDESC) *
                                  MPIR_proctable_size);
         totalview_jobid = NULL;
@@ -174,6 +176,7 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job)
             remote_argv[1] = NULL;
         }
     }
+    name = nodelist_nth_host(job->step_layout->node_list, nodeid);
     for (i = 0; i < ntasks; i++) {
         MPIR_PROCDESC *tv;
         int taskid, pid;
@@ -182,15 +185,15 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job)
         safe_read(fd, &pid, sizeof(int));
 
         tv = &MPIR_proctable[taskid];
-        tv->host_name = job->step_layout->host[nodeid];
+        tv->host_name = xstrdup(name);
         tv->pid = pid;
         tv->executable_name = executable;
         tasks_recorded++;
     }
-
+    free(name);
     /* if all tasks are now accounted for, set the debug state and
        call the Breakpoint */
-    if (tasks_recorded == job->step_layout->num_tasks) {
+    if (tasks_recorded == job->step_layout->task_cnt) {
         if (opt.multi_prog)
             set_multi_name(ntasks);
         MPIR_debug_state = MPIR_DEBUG_SPAWNED;
@@ -198,7 +201,7 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job)
         if (opt.debugger_test)
             _dump_proctable(job);
     }
-
+
     return;
 
 rwfail:
@@ -211,20 +214,14 @@ static void _update_step_layout(int fd, slurm_step_layout_t *layout,
 {
     int msg_type = PIPE_UPDATE_STEP_LAYOUT;
     int dummy = 0xdeadbeef;
-    int len = 0;
 
     safe_write(fd, &msg_type, sizeof(int)); /* read by par_thr() */
     safe_write(fd, &dummy, sizeof(int));    /* read by par_thr() */
 
     /* the rest are read by _handle_update_step_layout() */
     safe_write(fd, &nodeid, sizeof(int));
-    safe_write(fd, &layout->num_hosts, sizeof(uint32_t));
-    safe_write(fd, &layout->num_tasks, sizeof(uint32_t));
-
-    len = strlen(layout->host[nodeid]) + 1;
-    safe_write(fd, &len, sizeof(int));
-    safe_write(fd, layout->host[nodeid], len);
-
+    safe_write(fd, &layout->node_cnt, sizeof(uint32_t));
+    safe_write(fd, &layout->task_cnt, sizeof(uint32_t));
     safe_write(fd, &layout->tasks[nodeid], sizeof(uint32_t));
     safe_write(fd, layout->tids[nodeid],
                layout->tasks[nodeid]*sizeof(uint32_t));
@@ -238,26 +235,18 @@ rwfail:
 static void _handle_update_step_layout(int fd, slurm_step_layout_t *layout)
 {
     int nodeid;
-    int len = 0;
-
+
     safe_read(fd, &nodeid, sizeof(int));
-    safe_read(fd, &layout->num_hosts, sizeof(uint32_t));
-    safe_read(fd, &layout->num_tasks, sizeof(uint32_t));
-    xassert(nodeid >= 0 && nodeid <= layout->num_tasks);
+    safe_read(fd, &layout->node_cnt, sizeof(uint32_t));
+    safe_read(fd, &layout->task_cnt, sizeof(uint32_t));
+    xassert(nodeid >= 0 && nodeid <= layout->task_cnt);
 
     /* If this is the first call to this function, then we probably need
        to intialize some of the arrays */
-    if (layout->host == NULL)
-        layout->host = xmalloc(layout->num_hosts * sizeof(char *));
     if (layout->tasks == NULL)
-        layout->tasks = xmalloc(layout->num_hosts * sizeof(uint32_t *));
+        layout->tasks = xmalloc(layout->node_cnt * sizeof(uint32_t *));
     if (layout->tids == NULL)
-        layout->tids = xmalloc(layout->num_hosts * sizeof(uint32_t *));
-
-    safe_read(fd, &len, sizeof(int));
-    /*xassert(layout->host[nodeid] == NULL);*/
-    layout->host[nodeid] = xmalloc(len);
-    safe_read(fd, layout->host[nodeid], len);
+        layout->tids = xmalloc(layout->node_cnt * sizeof(uint32_t *));
 
     safe_read(fd, &layout->tasks[nodeid], sizeof(uint32_t));
     xassert(layout->tids[nodeid] == NULL);
@@ -274,12 +263,12 @@ rwfail:
 static void _dump_proctable(srun_job_t *job)
 {
     int node_inx, task_inx, taskid;
-    int num_tasks;
+    int task_cnt;
     MPIR_PROCDESC *tv;
 
     for (node_inx=0; node_inx<job->nhosts; node_inx++) {
-        num_tasks = job->step_layout->tasks[node_inx];
-        for (task_inx = 0; task_inx < num_tasks; task_inx++) {
+        task_cnt = job->step_layout->tasks[node_inx];
+        for (task_inx = 0; task_inx < task_cnt; task_inx++) {
             taskid = job->step_layout->tids[node_inx][task_inx];
             tv = &MPIR_proctable[taskid];
             if (!tv)
@@ -543,13 +532,18 @@ static void _confirm_launch_complete(srun_job_t *job)
 {
     int i;
+    char *name = NULL;
+
     printf("job->nhosts %d\n",job->nhosts);
 
     for (i=0; i<job->nhosts; i++) {
         printf("job->nhosts %d\n",job->nhosts);
         if (job->host_state[i] != SRUN_HOST_REPLIED) {
+            name = nodelist_nth_host(job->step_layout->node_list,
+                                     i);
             error ("Node %s not responding, terminating job step",
-                   job->step_layout->host[i]);
+                   name);
+            free(name);
             info("sending Ctrl-C to remaining tasks");
             fwd_signal(job, SIGINT, opt.max_threads);
             job->rc = 124;
@@ -616,7 +610,6 @@ _reattach_handler(srun_job_t *job, slurm_msg_t *msg)
     for (i = 0; i < resp->ntasks; i++) {
         job->step_layout->tids[resp->srun_node_id][i] =
             resp->gtids[i];
-        job->step_layout->hostids[resp->gtids[i]] = resp->srun_node_id;
         info ("setting task%d on hostid %d\n",
               resp->gtids[i], resp->srun_node_id);
     }
@@ -733,7 +726,7 @@ _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg)
     int status = msg->return_code;
     int i;
     char buf[1024];
-
+
     if (!(host = step_layout_host_name(job->step_layout, task0)))
         host = "Unknown host";
     debug2("exited host %s", host);
@@ -771,9 +764,9 @@ _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg)
             update_job_state(job, SRUN_JOB_TERMINATED);
         }
     }
-
+
     update_tasks_state(job, step_layout_host_id(job->step_layout, task0));
-
+
     _print_exit_status(job, hl, host, status);
 
     hostlist_destroy(hl);
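
[Editor's note: _update_step_layout()/_handle_update_step_layout() above ship the per-node slice of the layout between srun's processes over a pipe: fixed-size scalars first, then tids[nodeid] sized by the tasks[nodeid] count that precedes it. A minimal standalone demonstration of that count-then-payload framing, using plain read/write in place of SLURM's safe_read/safe_write retry-and-rwfail wrappers and with error checks trimmed:]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    int fds[2];
    uint32_t ntasks = 3, tids[3] = { 0, 4, 8 };
    uint32_t r_ntasks, *r_tids;

    if (pipe(fds) < 0)
        return 1;

    /* writer side: count first, then the array */
    write(fds[1], &ntasks, sizeof(uint32_t));
    write(fds[1], tids, ntasks * sizeof(uint32_t));

    /* reader side: size the destination from the count just read */
    read(fds[0], &r_ntasks, sizeof(uint32_t));
    r_tids = malloc(r_ntasks * sizeof(uint32_t));
    read(fds[0], r_tids, r_ntasks * sizeof(uint32_t));

    printf("got %u tids, first %u\n", r_ntasks, r_tids[0]);
    free(r_tids);
    close(fds[0]);
    close(fds[1]);
    return 0;
}
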
diff --git a/src/srun/reattach.c b/src/srun/reattach.c
index 3695872c9b54ec7fd82d354b9e73c4abe029a734..045dd4e81b29d0b16f2f31eac8d18979134fc97a 100644
--- a/src/srun/reattach.c
+++ b/src/srun/reattach.c
@@ -252,7 +252,8 @@ _get_step_info(srun_step_t *s)
 
     xassert(s->stepid != NO_VAL);
 
-    if (slurm_get_job_steps((time_t) 0, s->jobid, s->stepid, &resp, 1) < 0) {
+    if (slurm_get_job_steps((time_t) 0, s->jobid, s->stepid, &resp, 1)
+        < 0) {
         error("Unable to get step information for %u.%u: %m",
               s->jobid, s->stepid);
         goto done;
@@ -397,7 +398,7 @@ _p_reattach_task(void *arg)
     int rc = 0;
     reattach_tasks_request_msg_t *req = t->msg->data;
     int nodeid = req->srun_node_id;
-    char *host = t->job->step_layout->host[nodeid];
+    char *host = nodelist_nth_host(t->job->step_layout->node_list, nodeid);
 
     t->state = THD_ACTIVE;
     debug3("sending reattach request to %s", host);
@@ -411,7 +412,7 @@ _p_reattach_task(void *arg)
         t->state = THD_DONE;
         t->job->host_state[nodeid] = SRUN_HOST_UNREACHABLE;
     }
-
+    free(host);
     slurm_mutex_lock(&active_mutex);
     active--;
     pthread_cond_signal(&active_cond);
@@ -492,8 +493,8 @@ int reattach()
 
     job->client_io = client_io_handler_create(
         fds,
-        job->step_layout->num_tasks,
-        job->step_layout->num_hosts,
+        job->step_layout->task_cnt,
+        job->step_layout->node_cnt,
         sig,
         opt.labelio);
     if (!job->client_io
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 0a98022b28134520a8082b483b38e165574e0589..0f9efdb935143ea617fa70b6900cc7be21f0e12f 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -193,6 +193,7 @@ int srun(int ac, char **av)
         _switch_standalone(job);
 
     } else if (opt.allocate) {
+        int cpu_cnt = 0, cpu_inx = 0, i;
         sig_setup_sigmask();
         if ( !(resp = allocate_nodes()) )
             exit(1);
@@ -206,14 +207,25 @@ int srun(int ac, char **av)
             _print_job_information(resp);
 
         job = job_create_allocation(resp);
-        job->step_layout = step_layout_create(resp, NULL, NULL);
-        if(!job->step_layout) {
-            fatal("step_layout not created correctly");
-        }
-        if(task_layout(job->step_layout, opt.nodelist,
-                       opt.distribution) != SLURM_SUCCESS) {
-            fatal("problem with task layout");
+        job->step_layout = xmalloc(sizeof(slurm_step_layout_t));
+        job->step_layout->node_list = xstrdup(resp->node_list);
+        job->step_layout->node_cnt = resp->node_cnt;
+        job->step_layout->tasks = xmalloc(sizeof(uint32_t)
+                                          * resp->node_cnt);
+        job->step_layout->task_cnt = 0;
+        for (i=0; i<job->step_layout->node_cnt; i++) {
+            job->step_layout->tasks[i] =
+                resp->cpus_per_node[cpu_inx];
+            job->step_layout->task_cnt +=
+                job->step_layout->tasks[i];
+
+            if ((++cpu_cnt) >= resp->cpu_count_reps[cpu_inx]) {
+                /* move to next record */
+                cpu_inx++;
+                cpu_cnt = 0;
+            }
         }
+
         if (msg_thr_create(job) < 0)
             job_fatal(job, "Unable to create msg thread");
         exitcode = _run_job_script(job, env);
@@ -318,8 +330,8 @@ int srun(int ac, char **av)
 
     job->client_io = client_io_handler_create(
         fds,
-        job->step_layout->num_tasks,
-        job->step_layout->num_hosts,
+        job->step_layout->task_cnt,
+        job->step_layout->node_cnt,
         sig,
         opt.labelio);
     if (!job->client_io
@@ -409,10 +421,10 @@ _task_count_string (srun_job_t *job)
     char *str = xstrdup ("");
     if(job->step_layout->tasks == NULL)
         return (str);
-    last_val = job->step_layout->cpus[0];
+    last_val = job->step_layout->tasks[0];
     last_cnt = 1;
     for (i=1; i<job->nhosts; i++) {
-        if (last_val == job->step_layout->cpus[i])
+        if (last_val == job->step_layout->tasks[i])
             last_cnt++;
         else {
             if (last_cnt > 1)
@@ -420,7 +432,7 @@ _task_count_string (srun_job_t *job)
             else
                 sprintf(tmp, "%d,", last_val);
             xstrcat(str, tmp);
-            last_val = job->step_layout->cpus[i];
+            last_val = job->step_layout->tasks[i];
             last_cnt = 1;
         }
     }
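
[Editor's note: the opt.allocate path above expands the allocation's run-length-encoded (cpus_per_node, cpu_count_reps) arrays into one tasks[i] entry per node, accumulating task_cnt along the way; it is the inverse of the compression done in step_layout_create(). A standalone version with invented numbers:]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* allocation-style RLE input: 1 node with 4 cpus, 2 nodes with 2 */
    uint32_t cpus_per_node[] = { 4, 2 };
    uint32_t cpu_count_reps[] = { 1, 2 };
    uint32_t node_cnt = 3;

    uint32_t *tasks = malloc(sizeof(uint32_t) * node_cnt);
    uint32_t task_cnt = 0, i;
    int cpu_inx = 0, cpu_cnt = 0;

    for (i = 0; i < node_cnt; i++) {
        tasks[i] = cpus_per_node[cpu_inx];
        task_cnt += tasks[i];
        if ((uint32_t)(++cpu_cnt) >= cpu_count_reps[cpu_inx]) {
            cpu_inx++;    /* move to next RLE record */
            cpu_cnt = 0;
        }
    }
    for (i = 0; i < node_cnt; i++)
        printf("node %u: %u tasks\n", i, tasks[i]);
    printf("task_cnt = %u\n", task_cnt);    /* 4 + 2 + 2 = 8 */
    free(tasks);
    return 0;
}
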
diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c
index d8dd490f8e496ab45f8c5ed4b09eca7fe64d9d6a..3de79d78d6f0850948fadf6b83c44c58641f0cea 100644
--- a/src/srun/srun_job.c
+++ b/src/srun/srun_job.c
@@ -96,7 +96,7 @@ job_create_noalloc(void)
     srun_job_t *job = NULL;
     allocation_info_t *ai = xmalloc(sizeof(*ai));
     uint32_t cpn = 1;
-
+    int i;
     hostlist_t hl = hostlist_create(opt.nodelist);
 
     if (!hl) {
@@ -125,16 +125,17 @@ job_create_noalloc(void)
      * Create job, then fill in host addresses
      */
     job = _job_create_structure(ai);
-    job->step_layout = step_layout_create(NULL, NULL, NULL);
-    job->step_layout->nodes = (char *)xstrdup(job->nodelist);
-    //job->step_layout->hl = hostlist_create(job->nodelist);
-    job->step_layout->cpus_per_node = ai->cpus_per_node;
-    job->step_layout->cpu_count_reps = ai->cpu_count_reps;
-    job->step_layout->num_hosts = job->nhosts;
-    job->step_layout->num_tasks = job->ntasks;
-
-    task_layout(job->step_layout, opt.nodelist, opt.distribution);
-
+    job->step_layout = xmalloc(sizeof(slurm_step_layout_t));
+    job->step_layout->node_list = xstrdup(job->nodelist);
+    job->step_layout->node_cnt = ai->nnodes;
+    job->step_layout->tasks = xmalloc(sizeof(uint32_t));
+    job->step_layout->task_cnt = 0;
+    for (i=0; i<job->step_layout->node_cnt; i++)
+        job->step_layout->tasks[i] = cpn;
+
+    job->step_layout->node_cnt = job->nhosts;
+    job->step_layout->task_cnt = job->ntasks;
+
     _job_fake_cred(job);
     job_update_io_fnames(job);
 
@@ -447,10 +448,14 @@ void
 report_job_status(srun_job_t *job)
 {
     int i;
+    hostlist_t hl = hostlist_create(job->nodelist);
+    char *name = NULL;
 
     for (i = 0; i < job->nhosts; i++) {
-        info ("host:%s state:%s", job->step_layout->host[i],
+        name = hostlist_shift(hl);
+        info ("host:%s state:%s", name,
               _host_state_name(job->host_state[i]));
+        free(name);
     }
 }