diff --git a/NEWS b/NEWS
index 1f7b52d9a2af998d0d927c38c7ca20071eab907f..36cd9f19cb6ea858487a100b6e75a8016ea24c28 100644
--- a/NEWS
+++ b/NEWS
@@ -235,6 +235,7 @@ documents those changes that are of interest to users and admins.
     calls resulting in slurm_xfree() Error: from read_config.c:642
  - BLUEGENE - Put back logic to make a block fail a boot 3 times before
    cancelling a users job.
+ - Fix problem using srun --exclude option for a job step.
 
 * Changes in SLURM 1.1.26
 =========================
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index 2a5c598d180b85e571864d8100714b14512f0314..3e965ed3fea5dc03ff86dce7247be06a5084738c 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -630,7 +630,7 @@ _step_req_create(srun_job_t *j)
 	r->name = xstrdup(opt.job_name);
 	r->relative = (uint16_t)opt.relative;
 	r->overcommit = opt.overcommit ? 1 : 0;
-	debug("requesting job %d, user %d, nodes %d (%s)",
+	debug("requesting job %d, user %d, nodes %d including (%s)",
 	      r->job_id, r->user_id, r->node_count, r->node_list);
 	debug("cpus %d, tasks %d, name %s, relative %d",
 	      r->cpu_count, r->num_tasks, r->name, r->relative);
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 31ffcfad4baf550cb1e132cd1fb6397359870989..459030c9deed131c01333d988e657aa363ec4ee3 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -258,27 +258,6 @@ int srun(int ac, char **av)
 
 		if (opt.alloc_nodelist == NULL)
 			opt.alloc_nodelist = xstrdup(resp->node_list);
-		/*
-		 * XXX: Kludgy fix to make sure job structure is created
-		 * with the correct number of nodes. We reset opt.min_nodes
-		 * here if it is not already set to simulate the
-		 * user explicitly using -N or SLURM_NNODES.
-		 *
-		 * First we see if the user has already set the nodes.
-		 * If not and the processes were set then we use
-		 * that number as the min nodes if not then we use
-		 * the number returned from the controller as the
-		 * number of nodes to run on. I am not sure there is
-		 * any other way to set these var's correctly
-		 */
-
-		if (!opt.nodes_set) {
-			if(opt.nprocs_set)
-				opt.min_nodes = opt.nprocs;
-			else
-				opt.min_nodes = resp->node_cnt;
-			opt.nodes_set = true;
-		}
 		slurm_free_resource_allocation_response_msg(resp);
 		if (opt.allocate) {
 			error("job %u already has an allocation",
diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c
index 6ac0cad64a285e919b839785ea1d39edb620e615..98f13f786bb114c8f9d0962dd584a950a52a88ba 100644
--- a/src/srun/srun_job.c
+++ b/src/srun/srun_job.c
@@ -169,13 +169,6 @@ job_step_create_allocation(uint32_t job_id)
 	ai->jobid = job_id;
 	ai->stepid = NO_VAL;
 
-	if(!opt.max_nodes)
-		opt.max_nodes = opt.min_nodes;
-
-	/* The reason we read in from the hostfile here is so if we don't
-	 * need all the hostfile we only get what the user asked for
-	 * (i.e. opt.max_nodes)
-	 */
 	if (opt.nodelist == NULL) {
 		char *nodelist = NULL;
 		char *hostfile = getenv("SLURM_HOSTFILE");
@@ -196,7 +189,12 @@ job_step_create_allocation(uint32_t job_id)
 			}
 		}
 	}
-	ai->nodelist = opt.alloc_nodelist;
+
+	ai->nodelist = opt.alloc_nodelist;
+	hl = hostlist_create(ai->nodelist);
+	hostlist_uniq(hl);
+	ai->nnodes = hostlist_count(hl);
+	hostlist_destroy(hl);
 
 	if (opt.exc_nodes) {
 		hostlist_t exc_hl = hostlist_create(opt.exc_nodes);
@@ -211,6 +209,7 @@ job_step_create_allocation(uint32_t job_id)
 			if (inx >= 0) {
 				debug("excluding node %s", node_name);
 				hostlist_delete_nth(hl, inx);
+				ai->nnodes--;	/* decrement node count */
 			}
 			free(node_name);
 		}
@@ -223,24 +222,46 @@ job_step_create_allocation(uint32_t job_id)
 		hostlist_destroy(hl);
 		xfree(opt.nodelist);
 		opt.nodelist = xstrdup(buf);
+		/* Don't reset the ai->nodelist because that is the
+		 * nodelist we want to say the allocation is under
+		 * opt.nodelist is what is used for the allocation.
+		 */
 		/* xfree(ai->nodelist); */
 		/* ai->nodelist = xstrdup(buf); */
 	}
 
+	/* get the correct number of hosts to run tasks on */
 	if(opt.nodelist) {
 		hl = hostlist_create(opt.nodelist);
+		hostlist_uniq(hl);
 		if(!hostlist_count(hl)) {
-			error("1 Hostlist is now nothing! Can't run job.");
+			error("Hostlist is now nothing! Can not run job.");
 			return NULL;
 		}
 		hostlist_ranged_string(hl, sizeof(buf), buf);
 		count = hostlist_count(hl);
 		hostlist_destroy(hl);
+		/* Don't reset the ai->nodelist because that is the
+		 * nodelist we want to say the allocation is under
+		 * opt.nodelist is what is used for the allocation.
+		 */
 		/* xfree(ai->nodelist); */
 		/* ai->nodelist = xstrdup(buf); */
 		xfree(opt.nodelist);
 		opt.nodelist = xstrdup(buf);
+	}
+
+	if (!opt.nodes_set) {
+		if(opt.nprocs_set)
+			opt.min_nodes = opt.nprocs;
+		else
+			opt.min_nodes = ai->nnodes;
+		opt.nodes_set = true;
 	}
+	if(!opt.max_nodes)
+		opt.max_nodes = opt.min_nodes;
+	if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
+		ai->nnodes = opt.max_nodes;
 
 	if(opt.distribution == SLURM_DIST_ARBITRARY) {
 		if(count != opt.nprocs) {
@@ -250,10 +271,6 @@
 		}
 	}
 
-	hl = hostlist_create(ai->nodelist);
-	hostlist_uniq(hl);
-	ai->nnodes = hostlist_count(hl);
-	hostlist_destroy(hl);
 	if (ai->nnodes == 0) {
 		error("No nodes in allocation, can't run job");
 		goto error;
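For orientation, here is a small, self-contained sketch (not part of the patch) of the node-counting logic the srun_job.c hunks introduce: count the unique hosts in the allocation's nodelist, drop any --exclude'd hosts, then clamp to the step's maximum node count. It is written against SLURM's internal hostlist API (src/common/hostlist.h); the function name step_node_count and its parameters are hypothetical illustrations, since srun itself operates on opt.* and the ai-> fields shown above.

/* Hedged sketch only: mirrors the patch's node-count handling, it is not
 * the code srun runs.  Assumes SLURM's internal hostlist API. */
#include <stdio.h>
#include <stdlib.h>
#include "src/common/hostlist.h"

/* Return how many nodes a job step may use after honoring an exclude
 * list and a maximum node count; -1 if no nodes remain. */
static int step_node_count(const char *alloc_nodes, const char *exc_nodes,
			   int max_nodes)
{
	hostlist_t hl = hostlist_create(alloc_nodes);
	int nnodes;

	hostlist_uniq(hl);
	nnodes = hostlist_count(hl);	/* like the new ai->nnodes setup */

	if (exc_nodes) {
		hostlist_t exc_hl = hostlist_create(exc_nodes);
		char *name;
		while ((name = hostlist_shift(exc_hl))) {
			int inx = hostlist_find(hl, name);
			if (inx >= 0) {
				hostlist_delete_nth(hl, inx);
				nnodes--;	/* like ai->nnodes-- above */
			}
			free(name);
		}
		hostlist_destroy(exc_hl);
	}
	hostlist_destroy(hl);

	if ((max_nodes > 0) && (max_nodes < nnodes))
		nnodes = max_nodes;	/* like the opt.max_nodes clamp */

	return (nnodes > 0) ? nnodes : -1;
}

int main(void)
{
	/* Example: 4-node allocation, one node excluded, capped at 2. */
	printf("%d\n", step_node_count("tux[0-3]", "tux2", 2));	/* -> 2 */
	return 0;
}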