diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 962a2467573ef5ade133babf4e6bc12d62d0e195..6f835a78d22f6688a0670806d3a998bb18c380ae 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -478,23 +478,24 @@ typedef struct job_info_msg {
 } job_info_msg_t;
 
 typedef struct slurm_step_layout {
-        char *nodes;
-        char *arbitrary_nodes;
-        char **host;             /* name for each host */
-
-        slurm_addr *node_addr;
-
-        uint32_t *cpus_per_node;
-        uint32_t *cpu_count_reps;
-        uint32_t *cpus;          /* count of processors on each host */
+        char *nodes;             /* list of nodes in step */
+        char **host;             /* list separated for convenience's sake,
+                                    created from nodes */
+        slurm_addr *node_addr;   /* corresponding addresses */
+        uint16_t num_hosts;      /* node count */
+        uint32_t num_tasks;      /* number of tasks to execute */
+
+        uint16_t num_cpu_groups; /* count of cpu group records */
+        uint32_t *cpus_per_node; /* consolidated list of cpus per node */
+        uint32_t *cpu_count_reps; /* how many hosts each cpus_per_node
+                                     entry refers to */
+        uint32_t *tasks;         /* number of tasks on each host */
         uint32_t **tids;         /* host id => task id mapping */
         uint32_t *hostids;       /* task id => host id mapping */
-
-        uint16_t num_hosts;      /* node count */
-        uint32_t num_tasks;      /* number of tasks to execute */
-        uint16_t task_dist;      /* see enum task_dist_state */
+        uint32_t *cpus;          /* total count of processors on each host */
+
 } slurm_step_layout_t;
 
 typedef struct job_step_specs {
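The comments added to slurm_step_layout_t describe a run-length encoded CPU table: each of the num_cpu_groups entries of cpus_per_node holds a CPU count, and the matching cpu_count_reps entry says how many consecutive hosts share that count. A minimal sketch of expanding those records into one value per host; expand_cpus() is a hypothetical helper, not part of this patch, but its loop mirrors the ones in task_layout() and _unpack_slurm_step_layout() below.

    #include <stdint.h>
    #include <stdlib.h>

    /* Expand (cpus_per_node, cpu_count_reps) into one CPU count per host.
     * Returns a malloc'd array of num_hosts entries, NULL on failure. */
    static uint32_t *expand_cpus(const uint32_t *cpus_per_node,
                                 const uint32_t *cpu_count_reps,
                                 uint16_t num_cpu_groups, uint16_t num_hosts)
    {
            uint32_t *cpus = malloc(sizeof(uint32_t) * num_hosts);
            int inx = 0, cnt = 0, i;

            if (cpus == NULL)
                    return NULL;
            for (i = 0; i < num_hosts && inx < num_cpu_groups; i++) {
                    cpus[i] = cpus_per_node[inx];
                    if (++cnt >= cpu_count_reps[inx]) {
                            inx++;  /* move to the next group record */
                            cnt = 0;
                    }
            }
            return cpus;
    }

For example, cpus_per_node = {4, 2} with cpu_count_reps = {3, 1} expands to {4, 4, 4, 2} for a four-host step.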
diff --git a/src/common/dist_tasks.c b/src/common/dist_tasks.c
index b62b0f4cf5421c1c86e0c55ba50ab69df54cac26..52c307a93cb59b0a350c0cd97368510d18cdd9d1 100644
--- a/src/common/dist_tasks.c
+++ b/src/common/dist_tasks.c
@@ -55,18 +55,35 @@
 static int _task_layout_block(slurm_step_layout_t *step_layout);
 static int _task_layout_cyclic(slurm_step_layout_t *step_layout);
 #ifndef HAVE_FRONT_END
-static int _task_layout_hostfile(slurm_step_layout_t *step_layout);
+static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
+                                 const char *arbitrary_nodes);
 #endif
-extern slurm_step_layout_t *_step_layout_create(
+
+/*
+ * distribute_tasks - determine how many tasks of a job will be run on
+ *                    each node. Distribution is influenced by number of
+ *                    cpus on each host.
+ * IN mlist - hostlist corresponding to cpu arrays
+ * IN num_cpu_groups - elements in below cpu arrays
+ * IN cpus_per_node - cpus per node
+ * IN cpu_count_reps - how many nodes have same cpu count
+ * IN tlist - hostlist of nodes on which to distribute tasks
+ * IN num_tasks - number of tasks to distribute across these cpus
+ * RET a pointer to a slurm_step_layout_t
+ * NOTE: allocates memory that the caller should release with
+ *       step_layout_destroy()
+ */
+slurm_step_layout_t *distribute_tasks(
        const char *mlist, const char *tlist,
-       uint32_t *cpus_per_node, uint32_t *cpu_count_reps,
-       uint32_t num_hosts, uint32_t num_tasks, uint16_t task_dist)
+       uint32_t *cpus_per_node, uint32_t *cpu_count_reps,
+       uint16_t num_cpu_groups,
+       uint16_t num_hosts, uint32_t num_tasks,
+       uint16_t task_dist)
 {
-       slurm_step_layout_t *step_layout = NULL;
-
-       step_layout = xmalloc(sizeof(slurm_step_layout_t));
+       char *arbitrary_nodes = NULL;
+       slurm_step_layout_t *step_layout =
+               xmalloc(sizeof(slurm_step_layout_t));
        if(!step_layout) {
                error("xmalloc error for step_layout");
                return NULL;
@@ -77,7 +94,7 @@
                char buf[8192];
                /* set the node list for the task layout later if user
                   supplied could be different that the job allocation */
-               step_layout->arbitrary_nodes = xstrdup(tlist);
+               arbitrary_nodes = xstrdup(tlist);
 
                hl = hostlist_create(tlist);
                hostlist_uniq(hl);
@@ -87,11 +104,20 @@
                step_layout->nodes = xstrdup(buf);
        } else {
                step_layout->nodes = xstrdup(tlist);
-               step_layout->arbitrary_nodes = NULL;
        }
-       step_layout->cpus_per_node = cpus_per_node;
-       step_layout->cpu_count_reps = cpu_count_reps;
-       step_layout->task_dist = task_dist;
+
+       step_layout->num_cpu_groups = num_cpu_groups;
+
+       step_layout->cpu_count_reps =
+               xmalloc(sizeof(uint32_t) * num_cpu_groups);
+       memcpy(step_layout->cpu_count_reps, cpu_count_reps,
+              (sizeof(uint32_t) * num_cpu_groups));
+
+       step_layout->cpus_per_node =
+               xmalloc(sizeof(uint32_t) * num_cpu_groups);
+       memcpy(step_layout->cpus_per_node, cpus_per_node,
+              (sizeof(uint32_t) * num_cpu_groups));
+
        step_layout->num_tasks = num_tasks;
 #ifdef HAVE_FRONT_END   /* Limited job step support */
 
@@ -103,36 +129,9 @@
 #else
        step_layout->num_hosts = num_hosts;
 #endif
-
-       return step_layout;
-}
-
-/*
- * distribute_tasks - determine how many tasks of a job will be run on each.
- *                    node. Distribution is influenced by number of cpus on
- *                    each host.
- * IN mlist - hostlist corresponding to cpu arrays
- * IN num_cpu_groups - elements in below cpu arrays
- * IN cpus_per_node - cpus per node
- * IN cpu_count_reps - how many nodes have same cpu count
- * IN tlist - hostlist of nodes on which to distribute tasks
- * IN num_tasks - number of tasks to distribute across these cpus
- * RET a pointer to an integer array listing task counts per node
- * NOTE: allocates memory that should be xfreed by caller
- */
-slurm_step_layout_t *distribute_tasks(
-       const char *mlist, const char *tlist,
-       uint32_t *cpus_per_node, uint32_t *cpu_count_reps,
-       uint32_t num_hosts, uint32_t num_tasks,
-       uint16_t task_dist)
-{
-       slurm_step_layout_t *step_layout = NULL;
-       step_layout = _step_layout_create(mlist, tlist,
-                                         cpus_per_node, cpu_count_reps,
-                                         num_hosts, num_tasks,
-                                         task_dist);
-       if(task_layout(step_layout) == SLURM_ERROR) {
+       if(task_layout(step_layout, arbitrary_nodes, task_dist)
+          == SLURM_ERROR) {
                step_layout_destroy(step_layout);
                step_layout = NULL;
        }
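distribute_tasks() now both builds the layout structure (absorbing the old _step_layout_create()) and runs task_layout() on it, so callers see a single entry point. A sketch of a call site under the new signature, with hypothetical locals; the real slurmctld call site is in the step_mgr.c hunk near the end of this patch, and SLURM_DIST_CYCLIC comes from enum task_dist_state in slurm.h.

    #include <stdint.h>
    #include "src/common/dist_tasks.h"  /* declares distribute_tasks() */

    /* Hypothetical call site showing the new argument order. */
    static slurm_step_layout_t *build_layout(const char *job_nodes,
                                             const char *step_nodes,
                                             uint32_t *cpus_per_node,
                                             uint32_t *cpu_count_reps,
                                             uint16_t num_cpu_groups,
                                             uint16_t node_count,
                                             uint32_t num_tasks)
    {
            slurm_step_layout_t *layout =
                    distribute_tasks(job_nodes,      /* mlist: matches cpu arrays */
                                     step_nodes,     /* tlist: hosts to lay out */
                                     cpus_per_node,
                                     cpu_count_reps,
                                     num_cpu_groups, /* new argument */
                                     node_count,
                                     num_tasks,
                                     SLURM_DIST_CYCLIC);
            /* caller releases with step_layout_destroy(layout) */
            return layout;
    }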
@@ -191,7 +190,6 @@ extern slurm_step_layout_t *step_layout_create(
                /* this info is only in the step_req which needs to put
                   in the step_layout most likely the num_tasks set above
                   from the job allocation is incorrect now for the step */
-               step_layout->task_dist = step_req->task_dist;
                step_layout->num_tasks = step_req->num_tasks;
        } else {
                debug("no step_req given for step_layout_create");
@@ -210,28 +208,42 @@ extern slurm_step_layout_t *step_layout_copy(slurm_step_layout_t *step_layout)
 
        layout = xmalloc(sizeof(slurm_step_layout_t));
        layout->nodes = xstrdup(step_layout->nodes);
-       layout->arbitrary_nodes = xstrdup(step_layout->arbitrary_nodes);
        layout->num_hosts = step_layout->num_hosts;
        layout->num_tasks = step_layout->num_tasks;
+       layout->num_cpu_groups = step_layout->num_cpu_groups;
 
        layout->node_addr = xmalloc(sizeof(slurm_addr) * layout->num_hosts);
        memcpy(layout->node_addr, step_layout->node_addr,
              (sizeof(slurm_addr) * layout->num_hosts));
+
        layout->cpus = xmalloc(sizeof(uint32_t) * layout->num_hosts);
        memcpy(layout->cpus, step_layout->cpus,
              (sizeof(uint32_t) * layout->num_hosts));
+
        layout->tasks = xmalloc(sizeof(uint32_t) * layout->num_hosts);
        memcpy(layout->tasks, step_layout->tasks,
              (sizeof(uint32_t) * layout->num_hosts));
+
        layout->hostids = xmalloc(sizeof(uint32_t) * layout->num_tasks);
        memcpy(layout->hostids, step_layout->hostids,
              (sizeof(uint32_t) * layout->num_tasks));
+
+       layout->cpus_per_node = xmalloc(sizeof(uint32_t)
+                                       * layout->num_cpu_groups);
+       memcpy(layout->cpus_per_node, step_layout->cpus_per_node,
+              (sizeof(uint32_t) * layout->num_cpu_groups));
+
+       layout->cpu_count_reps = xmalloc(sizeof(uint32_t)
+                                        * layout->num_cpu_groups);
+       memcpy(layout->cpu_count_reps, step_layout->cpu_count_reps,
+              (sizeof(uint32_t) * layout->num_cpu_groups));
+
        layout->host = xmalloc(sizeof(char *) * layout->num_hosts);
        layout->tids = xmalloc(sizeof(uint32_t *) * layout->num_hosts);
        for (i=0; i<layout->num_hosts; i++) {
                layout->host[i] = malloc(strlen(step_layout->host[i]));
                strcpy(layout->host[i], step_layout->host[i]);
+
                layout->tids[i] = xmalloc(sizeof(uint32_t)
                                          * layout->tasks[i]);
                memcpy(layout->tids[i], step_layout->tids[i],
                      (sizeof(uint32_t) * layout->tasks[i]));
@@ -246,7 +258,6 @@ extern int step_layout_destroy(slurm_step_layout_t *step_layout)
        int i=0;
        if(step_layout) {
                xfree(step_layout->nodes);
-               xfree(step_layout->arbitrary_nodes);
                xfree(step_layout->node_addr);
                for (i=0; i<step_layout->num_hosts; i++) {
                        if(step_layout->host && step_layout->host[i])
@@ -258,6 +269,8 @@ extern int step_layout_destroy(slurm_step_layout_t *step_layout)
                xfree(step_layout->cpus);
                xfree(step_layout->tasks);
                xfree(step_layout->hostids);
+               xfree(step_layout->cpus_per_node);
+               xfree(step_layout->cpu_count_reps);
 
                xfree(step_layout);
        }
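With the cpu arrays deep-copied both in distribute_tasks() and in step_layout_copy() above, step_layout_destroy() can xfree them unconditionally: no field of the structure aliases caller-owned memory. A reduced sketch of that ownership contract, assuming SLURM's xmalloc/xfree from src/common/xmalloc.h; the struct and helpers are illustrative only, not part of the patch.

    #include <stdint.h>
    #include <string.h>
    #include "src/common/xmalloc.h"  /* assumed in-tree path */

    /* Whatever the copy allocates, the destroy releases. */
    typedef struct cpu_table {
            uint16_t num_groups;
            uint32_t *cpus_per_node;
            uint32_t *cpu_count_reps;
    } cpu_table_t;

    static void cpu_table_copy(cpu_table_t *dst, const cpu_table_t *src)
    {
            dst->num_groups = src->num_groups;
            dst->cpus_per_node = xmalloc(sizeof(uint32_t) * src->num_groups);
            memcpy(dst->cpus_per_node, src->cpus_per_node,
                   sizeof(uint32_t) * src->num_groups);
            dst->cpu_count_reps = xmalloc(sizeof(uint32_t) * src->num_groups);
            memcpy(dst->cpu_count_reps, src->cpu_count_reps,
                   sizeof(uint32_t) * src->num_groups);
    }

    static void cpu_table_destroy(cpu_table_t *t)
    {
            xfree(t->cpus_per_node);   /* always owned here, so always safe */
            xfree(t->cpu_count_reps);
    }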
@@ -266,10 +279,12 @@
 }
 
 /* build maps for task layout on nodes */
-extern int task_layout(slurm_step_layout_t *step_layout)
+extern int task_layout(slurm_step_layout_t *step_layout,
+                       const char *arbitrary_nodes,
+                       uint16_t task_dist)
 {
        int cpu_cnt = 0, cpu_inx = 0, i;
-       hostlist_t hl;
+       hostlist_t hl = NULL;
 
        if (step_layout->num_hosts == 0)
                return SLURM_ERROR;
@@ -300,6 +315,7 @@
                        step_layout->nodes);
        if(step_layout->num_hosts < 1) {
                error("no hostlist given, can't lay out tasks");
+               hostlist_destroy(hl);
                return SLURM_ERROR;
        }
        for (i=0; i<step_layout->num_hosts; i++) {
@@ -314,6 +330,7 @@
                debug2("host %d = %s", i, step_layout->host[i]);
                step_layout->cpus[i] = step_layout->cpus_per_node[cpu_inx];
+
                if ((++cpu_cnt) >= step_layout->cpu_count_reps[cpu_inx]) {
                        /* move to next record */
                        cpu_inx++;
@@ -322,11 +339,16 @@
        }
        hostlist_destroy(hl);
 
-       if (step_layout->task_dist == SLURM_DIST_CYCLIC)
+       if(step_layout->num_cpu_groups != cpu_inx) {
+               info("got %d cpu groups but expected %d",
+                    cpu_inx, step_layout->num_cpu_groups);
+       }
+
+       if (task_dist == SLURM_DIST_CYCLIC)
                return _task_layout_cyclic(step_layout);
 #ifndef HAVE_FRONT_END
-       else if(step_layout->task_dist == SLURM_DIST_ARBITRARY)
-               return _task_layout_hostfile(step_layout);
+       else if(task_dist == SLURM_DIST_ARBITRARY)
+               return _task_layout_hostfile(step_layout, arbitrary_nodes);
 #endif
        else
                return _task_layout_block(step_layout);
@@ -356,7 +378,8 @@ step_layout_host_name (slurm_step_layout_t *s, int taskid)
 
 /* use specific set run tasks on each host listed in hostfile
  * XXX: Need to handle over-subscribe. */
-static int _task_layout_hostfile(slurm_step_layout_t *step_layout)
+static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
+                                 const char *arbitrary_nodes)
 {
        int i=0, j, taskid = 0, task_cnt=0;
        hostlist_iterator_t itr = NULL, itr_task = NULL;
@@ -368,8 +391,8 @@
        debug2("job list is %s", step_layout->nodes);
        job_alloc_hosts = hostlist_create(step_layout->nodes);
        itr = hostlist_iterator_create(job_alloc_hosts);
-       debug2("list is %s", step_layout->arbitrary_nodes);
-       step_alloc_hosts = hostlist_create(step_layout->arbitrary_nodes);
+       debug2("list is %s", arbitrary_nodes);
+       step_alloc_hosts = hostlist_create(arbitrary_nodes);
        if(hostlist_count(step_alloc_hosts) != step_layout->num_tasks) {
                error("Asked for %d tasks have %d in the nodelist. "
                      "Check your nodelist",
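task_layout() now takes task_dist as an explicit argument and dispatches to _task_layout_block(), _task_layout_cyclic(), or _task_layout_hostfile(). For intuition, a standalone sketch of the difference between the block and cyclic placements; it is simplified (equal CPUs per host, plain C, no SLURM calls) and not taken from the patch.

    #include <stdio.h>

    /* Illustration only: 5 tasks across 2 hosts. */
    int main(void)
    {
            enum { HOSTS = 2, TASKS = 5 };
            int t;

            /* Block: fill one host before starting the next.
             * Prints 0 0 0 1 1. */
            printf("block:  ");
            for (t = 0; t < TASKS; t++)
                    printf("%d ", t * HOSTS / TASKS);

            /* Cyclic: deal tasks out one per host, round robin.
             * Prints 0 1 0 1 0. */
            printf("\ncyclic: ");
            for (t = 0; t < TASKS; t++)
                    printf("%d ", t % HOSTS);
            printf("\n");
            return 0;
    }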
" "Check your nodelist", diff --git a/src/common/dist_tasks.h b/src/common/dist_tasks.h index de9b7ca7cd3107689535d3ffd9ce258237aef21e..b54769d97ad2bcc361f31f93cc7377661923f0f6 100644 --- a/src/common/dist_tasks.h +++ b/src/common/dist_tasks.h @@ -63,7 +63,8 @@ slurm_step_layout_t *distribute_tasks(const char *mlist, const char *tlist, uint32_t *cpus_per_node, uint32_t *cpu_count_reps, - uint32_t num_hosts, + uint16_t num_cpu_groups, + uint16_t num_hosts, uint32_t num_tasks, uint16_t task_dist); @@ -76,7 +77,8 @@ extern slurm_step_layout_t *step_layout_copy(slurm_step_layout_t *step_layout); /* destroys structure for step layout */ extern int step_layout_destroy(slurm_step_layout_t *step_layout); /* build maps for task layout on nodes */ -extern int task_layout(slurm_step_layout_t *step_layout); +extern int task_layout(slurm_step_layout_t *step_layout, + const char *arbitrary_nodes, uint16_t task_dist); extern int step_layout_host_id (slurm_step_layout_t *s, int taskid); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index ee2f2571d837a71a5d445cd5d527913f93d03354..a445a3465301111165ed79c22e87cf0f9e32490f 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -3246,15 +3246,20 @@ _pack_slurm_step_layout(slurm_step_layout_t *step_layout, Buf buffer) packstr(step_layout->nodes, buffer); pack16(step_layout->num_hosts, buffer); pack32(step_layout->num_tasks, buffer); + pack16(step_layout->num_cpu_groups, buffer); _pack_slurm_addr_array(step_layout->node_addr, step_layout->num_hosts, buffer); - pack32_array(step_layout->cpus, step_layout->num_hosts, buffer); + + pack32_array(step_layout->cpus_per_node, + step_layout->num_cpu_groups, buffer); + pack32_array(step_layout->cpu_count_reps, + step_layout->num_cpu_groups, buffer); + pack32_array(step_layout->tasks, step_layout->num_hosts, buffer); pack32_array(step_layout->hostids, step_layout->num_tasks, buffer); for(i=0; i<step_layout->num_hosts; i++) { pack32_array(step_layout->tids[i], step_layout->tasks[i], buffer); - packstr(step_layout->host[i], buffer); } } @@ -3263,24 +3268,26 @@ _unpack_slurm_step_layout(slurm_step_layout_t **layout, Buf buffer) { uint16_t uint16_tmp; uint32_t uint32_tmp; - int i; + int cpu_cnt = 0, cpu_inx = 0, i; slurm_step_layout_t *step_layout; + hostlist_t hl = NULL; step_layout = xmalloc(sizeof(slurm_step_layout_t)); *layout = step_layout; step_layout->nodes = NULL; - step_layout->arbitrary_nodes = NULL; step_layout->num_hosts = 0; step_layout->host = NULL; step_layout->tids = NULL; step_layout->cpus = NULL; step_layout->tasks = NULL; step_layout->hostids = NULL; - + step_layout->cpus_per_node = NULL; + step_layout->cpu_count_reps = NULL; safe_unpackstr_xmalloc(&step_layout->nodes, &uint16_tmp, buffer); safe_unpack16(&step_layout->num_hosts, buffer); safe_unpack32(&step_layout->num_tasks, buffer); + safe_unpack16(&step_layout->num_cpu_groups, buffer); if (_unpack_slurm_addr_array(&(step_layout->node_addr), &uint16_tmp, buffer)) @@ -3288,9 +3295,14 @@ _unpack_slurm_step_layout(slurm_step_layout_t **layout, Buf buffer) if (uint16_tmp != step_layout->num_hosts) goto unpack_error; - safe_unpack32_array(&(step_layout->cpus), + safe_unpack32_array(&(step_layout->cpus_per_node), &uint32_tmp, buffer); - if (uint32_tmp != step_layout->num_hosts) + if (uint32_tmp != step_layout->num_cpu_groups) + goto unpack_error; + + safe_unpack32_array(&(step_layout->cpu_count_reps), + &uint32_tmp, buffer); + if (uint32_tmp != step_layout->num_cpu_groups) goto unpack_error; 
safe_unpack32_array(&(step_layout->tasks), @@ -3307,18 +3319,39 @@ _unpack_slurm_step_layout(slurm_step_layout_t **layout, Buf buffer) * step_layout->num_hosts); step_layout->host = xmalloc(sizeof(char *) * step_layout->num_hosts); + step_layout->cpus = xmalloc(sizeof(uint32_t) + * step_layout->num_hosts); + + hl = hostlist_create(step_layout->nodes); for(i=0; i<step_layout->num_hosts; i++) { safe_unpack32_array(&(step_layout->tids[i]), &uint32_tmp, buffer); if (uint32_tmp != step_layout->tasks[i]) goto unpack_error; - safe_unpackstr_malloc(&step_layout->host[i], &uint16_tmp, - buffer); + + step_layout->host[i] = hostlist_shift(hl); + if(!step_layout->host[i]) { + error("hostlist incomplete for this job request"); + goto unpack_error; + } + step_layout->cpus[i] = step_layout->cpus_per_node[cpu_inx]; + + if ((++cpu_cnt) >= step_layout->cpu_count_reps[cpu_inx]) { + /* move to next record */ + cpu_inx++; + cpu_cnt = 0; + } } + hostlist_destroy(hl); + if(step_layout->num_cpu_groups != cpu_inx) { + error("we got %d cpu groups but was looking for %d", + cpu_inx, step_layout->num_cpu_groups); + } return SLURM_SUCCESS; unpack_error: + hostlist_destroy(hl); step_layout_destroy(step_layout); *layout = NULL; return SLURM_ERROR; diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 605caff6791050019c2d7ea659503509db0aecbf..c35ab7fdb188efc43bcd10c94a0c1a4337e5441f 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -725,6 +725,7 @@ step_create(job_step_create_request_msg_t *step_specs, step_ptr->step_node_list, job_ptr->cpus_per_node, job_ptr->cpu_count_reps, + job_ptr->num_cpu_groups, step_specs->node_count, step_ptr->num_tasks, step_specs->task_dist); diff --git a/src/srun/srun.c b/src/srun/srun.c index feaba67f9d495e452a32e2a3ea981ecd1a205885..0a98022b28134520a8082b483b38e165574e0589 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -210,7 +210,8 @@ int srun(int ac, char **av) if(!job->step_layout) { fatal("step_layout not created correctly"); } - if(task_layout(job->step_layout) != SLURM_SUCCESS) { + if(task_layout(job->step_layout, opt.nodelist, + opt.distribution) != SLURM_SUCCESS) { fatal("problem with task layout"); } if (msg_thr_create(job) < 0) diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c index bbd4595e2e53a190b6c0b075376cfb37dffbf6db..d8dd490f8e496ab45f8c5ed4b09eca7fe64d9d6a 100644 --- a/src/srun/srun_job.c +++ b/src/srun/srun_job.c @@ -127,14 +127,13 @@ job_create_noalloc(void) job = _job_create_structure(ai); job->step_layout = step_layout_create(NULL, NULL, NULL); job->step_layout->nodes = (char *)xstrdup(job->nodelist); - job->step_layout->arbitrary_nodes = NULL; //job->step_layout->hl = hostlist_create(job->nodelist); job->step_layout->cpus_per_node = ai->cpus_per_node; job->step_layout->cpu_count_reps = ai->cpu_count_reps; job->step_layout->num_hosts = job->nhosts; job->step_layout->num_tasks = job->ntasks; - task_layout(job->step_layout); + task_layout(job->step_layout, opt.nodelist, opt.distribution); _job_fake_cred(job); job_update_io_fnames(job); diff --git a/testsuite/expect/test1.42 b/testsuite/expect/test1.42 index 84ecc1b936167adeb6c987c51f003267b9fd0eed..f79fd4bc19628dd7d8910aa572bb12df5d592a97 100755 --- a/testsuite/expect/test1.42 +++ b/testsuite/expect/test1.42 @@ -82,6 +82,7 @@ if {$job_id1 == 0} { # set match_acct 0 set match_state 0 +set timeout 30 spawn $srun -v --dependency=$job_id1 $scontrol show job $job_id1 expect { -re "launching ($number).0" { diff --git a/testsuite/expect/test1.84 b/testsuite/expect/test1.84 
diff --git a/testsuite/expect/test1.84 b/testsuite/expect/test1.84
index d347d2ab7c768f6e46acf41a4f8a89392e926428..e3ba5028c4cee652c6f8610aba047ea4fd88e8e2 100755
--- a/testsuite/expect/test1.84
+++ b/testsuite/expect/test1.84
@@ -46,6 +46,10 @@ if { [test_front_end] } {
        send_user "\nWARNING: This test is incompatable with FRONT_END systems\n"
        exit 0
 }
+if {[test_multiple_slurmd] != 0} {
+       send_user "\nWARNING: This test is incompatible with multiple slurmd systems\n"
+       exit 0
+}
 
 #
 # Submit a 1 node job to determine the node's CPU count
diff --git a/testsuite/expect/test1.86 b/testsuite/expect/test1.86
index 12c63f5b4dac42a664651978d92a9f6e01763c02..96ea964a72a2d5f7a24978066a496a0cbdc13897 100755
--- a/testsuite/expect/test1.86
+++ b/testsuite/expect/test1.86
@@ -45,6 +45,11 @@ if {[test_front_end] != 0} {
        exit 0
 }
 
+if {[test_multiple_slurmd] != 0} {
+       send_user "\nWARNING: This test is incompatible with multiple slurmd systems\n"
+       exit 0
+}
+
 #
 # Build input script file
 #
diff --git a/testsuite/expect/test7.3.prog.c b/testsuite/expect/test7.3.prog.c
index ae62c99ad400762b4a908a30adc6b996a9876201..8814bdfe208a5c3227b477a69a771d18888aa933 100644
--- a/testsuite/expect/test7.3.prog.c
+++ b/testsuite/expect/test7.3.prog.c
@@ -46,7 +46,6 @@ int main (int argc, char *argv[])
        job_desc_msg_t job_req;
        resource_allocation_response_msg_t *job_resp;
        job_step_create_request_msg_t step_req;
-       old_job_alloc_msg_t old_alloc;
        slurm_step_ctx ctx = NULL;
        char *task_argv[3];
        char cwd[128];
@@ -83,11 +82,11 @@ int main (int argc, char *argv[])
            (strlen(job_resp->node_list) == 0)) {
                printf("Waiting for resource allocation\n");
                fflush(stdout);
-               old_alloc.job_id = job_resp->job_id;
                while ((job_resp->node_list == NULL) ||
                       (strlen(job_resp->node_list) == 0)) {
                        sleep(5);
-                       if (slurm_confirm_allocation(&old_alloc, &job_resp) &&
+                       if (slurm_allocation_lookup_lite(job_resp->job_id,
+                                                        &job_resp) &&
                            (slurm_get_errno() != ESLURM_JOB_PENDING)) {
                                slurm_perror("slurm_confirm_allocation");
                                exit(0);
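test7.3.prog.c switches from the removed old_job_alloc_msg_t/slurm_confirm_allocation() pair to slurm_allocation_lookup_lite(), which takes the job id directly. The polling pattern in isolation, as a sketch distilled from the hunk above; wait_for_nodes() is a hypothetical helper and the headers assumed are the public slurm ones.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <slurm/slurm.h>
    #include <slurm/slurm_errno.h>

    /* Poll a pending allocation until nodes are assigned. */
    static int wait_for_nodes(resource_allocation_response_msg_t **job_resp)
    {
            while (((*job_resp)->node_list == NULL) ||
                   (strlen((*job_resp)->node_list) == 0)) {
                    sleep(5);
                    if (slurm_allocation_lookup_lite((*job_resp)->job_id,
                                                     job_resp) &&
                        (slurm_get_errno() != ESLURM_JOB_PENDING)) {
                            slurm_perror("slurm_allocation_lookup_lite");
                            return -1;
                    }
            }
            return 0;
    }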