diff --git a/NEWS b/NEWS
index 50b0b5f764e9ee6e18f9d0dd0ad2fe999781ae85..ec1d64f3cb16d2daeaab9c92798de07a6102585e 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,8 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 1.2.0-pre2
 =============================
+ -- Fixed task distribution to work with a hostfile and warn when asking for
+    more tasks than there are nodes in arbitrary mode.
 
 * Changes in SLURM 1.2.0-pre1
 =============================
diff --git a/src/common/dist_tasks.c b/src/common/dist_tasks.c
index cf08c3f64781b0a926be0e10cc07ec5321058050..566191965f183de5dddc8f0e8cbd72e8a5ea72bf 100644
--- a/src/common/dist_tasks.c
+++ b/src/common/dist_tasks.c
@@ -169,18 +169,6 @@ extern slurm_step_layout_t *step_layout_create(
 	job_step_create_request_msg_t *step_req)
 {
 	slurm_step_layout_t *step_layout = NULL;
-	char *temp = NULL;
-
-	/*
-	 * Swap the step_req and step_resp node lists.
-	 * (Why? I don't know.  This really needs refactoring. - CJM)
-	 * FIXME!
-	 */
-	if(step_req && step_req->node_list != NULL && step_resp) {
-		temp = step_req->node_list;
-		step_req->node_list = step_resp->node_list;
-		step_resp->node_list = temp;
-	}
 
 	step_layout = xmalloc(sizeof(slurm_step_layout_t));
 	if(!step_layout) {
@@ -213,34 +201,19 @@
 	}
 
 	if(step_resp) {
-		step_layout->step_nodes =
-			(char *)xstrdup(step_resp->node_list);
-		if (step_layout->hl)
-			hostlist_destroy(step_layout->hl);
-		step_layout->hl = hostlist_create(step_resp->node_list);
+		/* set the node list for the task layout later; if user
+		   supplied it could be different than the job allocation */
+		step_layout->step_nodes = xstrdup(step_resp->node_list);
+/* 		info("host list is %s", step_resp->node_list); */
 	} else {
 		debug("no step_resp given for step_layout_create");
 		step_layout->step_nodes = NULL;
 	}
 
 	if(step_req) {
-		if (step_req->node_list != NULL) {
-			if(step_layout->hl)
-				hostlist_destroy(step_layout->hl);
-			step_layout->hl = hostlist_create(step_req->node_list);
-#ifdef HAVE_FRONT_END	/* Limited job step support */
-			/* All jobs execute through front-end on Blue Gene.
-			 * Normally we would not permit execution of job steps,
-			 * but can fake it by just allocating all tasks to
-			 * one of the allocated nodes. */
-			step_layout->num_hosts = 1;
-#else
-			step_layout->num_hosts = hostlist_count(step_layout->hl);
-#endif
-		} else {
-			step_layout->num_hosts = step_req->node_count;
-		}
-
+		/* this info is only in the step_req and needs to be put in
+		   the step_layout; most likely the num_tasks set above from
+		   the job allocation is incorrect now for the step */
 		step_layout->task_dist = step_req->task_dist;
 		step_layout->num_tasks = step_req->num_tasks;
 	} else {
@@ -298,6 +271,12 @@ extern int task_layout(slurm_step_layout_t *step_layout)
 
 	for (i=0; i<step_layout->num_hosts; i++) {
 		step_layout->host[i] = hostlist_shift(step_layout->hl);
+		if(!step_layout->host[i]) {
+			error("hostlist incomplete for this job request");
+			return SLURM_ERROR;
+		}
+
+		debug2("host %d = %s", i, step_layout->host[i]);
 		step_layout->cpus[i] = step_layout->cpus_per_node[cpu_inx];
 		if ((++cpu_cnt) >= step_layout->cpu_count_reps[cpu_inx]) {
 			/* move to next record */
@@ -305,7 +284,7 @@ extern int task_layout(slurm_step_layout_t *step_layout)
 			cpu_cnt = 0;
 		}
 	}
-	
+
 	if (step_layout->task_dist == SLURM_DIST_CYCLIC)
 		return _task_layout_cyclic(step_layout);
 #ifndef HAVE_FRONT_END
@@ -342,25 +321,38 @@ step_layout_host_name (slurm_step_layout_t *s, int taskid)
  */
 static int _task_layout_hostfile(slurm_step_layout_t *step_layout)
 {
-	int i=0, j, taskid = 0;
+	int i=0, j, taskid = 0, task_cnt=0;
 	hostlist_iterator_t itr = NULL, itr_task = NULL;
 	char *host = NULL;
 	char *host_task = NULL;
 	hostlist_t job_alloc_hosts = NULL;
 	hostlist_t step_alloc_hosts = NULL;
 
+	debug3("job list is %s", step_layout->alloc_nodes);
 	job_alloc_hosts = hostlist_create(step_layout->alloc_nodes);
 	itr = hostlist_iterator_create(job_alloc_hosts);
+	debug3("list is %s", step_layout->step_nodes);
 	step_alloc_hosts = hostlist_create(step_layout->step_nodes);
+	if(hostlist_count(step_alloc_hosts) != step_layout->num_tasks) {
+		error("Asked for %d tasks but have %d in the nodelist. "
+		      "Check your nodelist",
+		      step_layout->num_tasks,
+		      hostlist_count(step_alloc_hosts));
+		return SLURM_ERROR;
+	}
 	itr_task = hostlist_iterator_create(step_alloc_hosts);
 	while((host = hostlist_next(itr))) {
 		step_layout->tasks[i] = 0;
 		while((host_task = hostlist_next(itr_task))) {
-			if(!strcmp(host, host_task))
+			if(!strcmp(host, host_task)) {
 				step_layout->tasks[i]++;
+				task_cnt++;
+			}
 			free(host_task);
+			if(task_cnt >= step_layout->num_tasks)
+				break;
 		}
-		debug2("%s got %d tasks\n",
+		debug3("%s got %d tasks\n",
 		       host,
 		       step_layout->tasks[i]);
 		if(step_layout->tasks[i] == 0)
@@ -378,17 +370,25 @@ static int _task_layout_hostfile(slurm_step_layout_t *step_layout)
 			}
 			taskid++;
 			free(host_task);
+			if(j >= step_layout->tasks[i])
+				break;
 		}
 		i++;
 	reset_hosts:
 		hostlist_iterator_reset(itr_task);
 		free(host);
+		if(i > step_layout->num_tasks)
+			break;
 	}
 	hostlist_iterator_destroy(itr);
 	hostlist_iterator_destroy(itr_task);
 	hostlist_destroy(job_alloc_hosts);
 	hostlist_destroy(step_alloc_hosts);
-	
+	if(task_cnt != step_layout->num_tasks) {
+		error("Asked for %d tasks but placed %d. Check your nodelist",
+		      step_layout->num_tasks, task_cnt);
+		return SLURM_ERROR;
+	}
 	return SLURM_SUCCESS;
 }
 #endif
diff --git a/src/common/global_srun.h b/src/common/global_srun.h
index 21ee0d12b30a96581340774c9639404d6efd5081..a8e186400a4e8bb023128e59c6a8b61775a489cc 100644
--- a/src/common/global_srun.h
+++ b/src/common/global_srun.h
@@ -143,6 +143,7 @@ typedef struct srun_job {
 
 } srun_job_t;
 
+
 void fwd_signal(srun_job_t *job, int signal, int max_threads);
 int job_active_tasks_on_host(srun_job_t *job, int hostid);
 
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 4d99d516e0e0ff750fdaa953f26ab4d3839c8480..c0dbb4d35113fee16540d45b75784a7a05d040d9 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1359,7 +1359,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run,
 	error_code = _job_create(job_specs, allocate, will_run,
 				 &job_ptr, submit_uid);
 	*job_pptr = job_ptr;
-	
+
 	if (error_code) {
 		if (immediate && job_ptr) {
 			job_ptr->job_state = JOB_FAILED;
@@ -1411,6 +1411,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run,
 		(!top_prio) || (!independent);
 
 	error_code = select_nodes(job_ptr, no_alloc);
+
 	if ((error_code == ESLURM_NODES_BUSY) ||
 	    (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) {
 		/* Not fatal error, but job can't be scheduled right now */
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index dc7fcae3df3a4d428bf4b25a261e64a99947411d..c8e2a3e27bb046787cb56a52b0c2d74e8cfb8b39 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -661,8 +661,8 @@ step_create(job_step_create_request_msg_t *step_specs,
 		step_ptr->step_node_list = xstrdup(step_specs->node_list);
 	else
 		step_ptr->step_node_list = bitmap2node_name(nodeset);
-	xfree(step_specs->node_list);
-	step_specs->node_list = bitmap2node_name(nodeset);
+	//xfree(step_specs->node_list);
+	//step_specs->node_list = bitmap2node_name(nodeset);
 	step_ptr->step_node_bitmap = nodeset;
 	step_ptr->cyclic_alloc = (uint16_t)
 		(step_specs->task_dist == SLURM_DIST_CYCLIC);
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index a6806cc44177b5890f773d00fce7e21f7d6f48cc..3f5d55512ffe84424e939c5d390cfef47eb80595 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -117,7 +117,7 @@ allocate_nodes(void)
 
 	if(!resp)
 		goto done;
-	
+
 	if ((rc == 0) && (resp->node_list == NULL)) {
 		if (resp->error_code)
 			info("Warning: %s", slurm_strerror(resp->error_code));
@@ -134,7 +134,7 @@ allocate_nodes(void)
 		xfree(resp->node_list);
 		resp->node_list = xstrdup(j->req_nodes);
 	}
-	
+
   done:
 	xsignal_set_mask(&oset);
 	xsignal(SIGINT, ointf);
@@ -410,6 +410,8 @@ job_desc_msg_create_from_opts (char *script)
 {
 	extern char **environ;
 	job_desc_msg_t *j = xmalloc(sizeof(*j));
+	char buf[8192];
+	hostlist_t hl = NULL;
 
 	slurm_init_job_desc_msg(j);
 
@@ -417,7 +419,7 @@ job_desc_msg_create_from_opts (char *script)
 	j->features = opt.constraints;
 	j->immediate = opt.immediate;
 	j->name = opt.job_name;
-	j->req_nodes = opt.nodelist;
+	j->req_nodes = xstrdup(opt.nodelist);
 	if (j->req_nodes == NULL) {
 		char *nodelist = NULL;
 		char *hostfile = getenv("SLURM_HOSTFILE");
@@ -436,6 +438,21 @@ job_desc_msg_create_from_opts (char *script)
 			}
 		}
 	}
+	/* simplify the job allocation nodelist; tasks are not
+	   laid out until the step is created */
+	if(j->req_nodes) {
+		hl = hostlist_create(j->req_nodes);
+		hostlist_ranged_string(hl, sizeof(buf), buf);
+		xfree(opt.nodelist);
+		opt.nodelist = xstrdup(buf);
+		hostlist_uniq(hl);
+		hostlist_ranged_string(hl, sizeof(buf), buf);
+		hostlist_destroy(hl);
+
+		xfree(j->req_nodes);
+		j->req_nodes = xstrdup(buf);
+	}
+
 	if(opt.distribution == SLURM_DIST_ARBITRARY &&
 	   !j->req_nodes) {
 		error("With Arbitrary distribution you need to "
@@ -561,7 +578,8 @@ _step_req_create(srun_job_t *j)
 	r->cpu_count = opt.overcommit ? j->nhosts
 		: (opt.nprocs*opt.cpus_per_task);
 	r->num_tasks = opt.nprocs;
-	r->node_list = xstrdup(j->nodelist);
+	r->node_list = xstrdup(opt.nodelist);
+	debug("requesting nodes %s", r->node_list);
 	r->network = xstrdup(opt.network);
 	r->name = xstrdup(opt.job_name);
 	r->relative = false; /* XXX fix this oneday */
diff --git a/src/srun/launch.c b/src/srun/launch.c
index 3dd4693dcb9804543041f7bfd90b914f26d7affa..f69df632c244e14a82fea835818182aa83c91713 100644
--- a/src/srun/launch.c
+++ b/src/srun/launch.c
@@ -197,6 +197,8 @@ launch(void *arg)
 	itr = hostlist_iterator_create(hostlist);
 	job->thr_count = 0;
 	for (i = 0; i < job->step_layout->num_hosts; i++) {
+		debug2("sending to %s %d %d", job->step_layout->host[i],
+		       i, job->step_layout->num_hosts);
 		if(!job->step_layout->host[i])
 			break;
 		slurm_msg_t *m = &msg_array_ptr[job->thr_count];
@@ -385,16 +387,15 @@ static void _p_launch(slurm_msg_t *req, srun_job_t *job)
 	 * Set job timeout to maximum launch time + current time
 	 */
 	job->ltimeout = time(NULL) + opt.max_launch_time;
-
 	thd = xmalloc (job->thr_count * sizeof (thd_t));
 	for (i = 0; i < job->thr_count; i++) {
-	/* 	if (job->step_layout->tasks[i] == 0) { */
-/* 		/\* No tasks for this node *\/ */
-/* 		debug("Node %s is unused",job->step_layout->host[i]); */
-/* 		job->host_state[i] = SRUN_HOST_REPLIED; */
-/* 		thd[i].thread = (pthread_t) NULL; */
-/* 		continue; */
-/* 	} */
+		if (job->step_layout->tasks[i] == 0) {
+			/* No tasks for this node */
+			debug("Node %s is unused",job->step_layout->host[i]);
+			job->host_state[i] = SRUN_HOST_REPLIED;
+			thd[i].thread = (pthread_t) NULL;
+			continue;
+		}
 		if (job->state > SRUN_JOB_LAUNCHING)
 			break;