diff --git a/NEWS b/NEWS index c0798f9f81f3b4a32a05ef62c645ec09dc579e88..ff88b93016b19ca93d1eb8cd180d8b180521ae43 100644 --- a/NEWS +++ b/NEWS @@ -86,6 +86,8 @@ documents those changes that are of interest to users and admins. - Fix bug when changing the time limit of a running job that has previously been suspended (formerly failed to account for suspend time in setting termination time). + - Fix for step allocation to be able to specify only a few nodes in a + step and ask for more than specified. * Changes in SLURM 1.1.13 ========================= diff --git a/src/common/env.c b/src/common/env.c index ded9a70f4eb3de9964e0fbd6ceff0b3d0296451d..99b88c822d0300f2cc0eea5de94f42e7019895aa 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -16,7 +16,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index fd82fb1bce1c64c6b95d532cf0ba18e1a469ceee..c50f652e741c6c30d6614f678c4cea8fd12059e4 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -5,7 +5,8 @@ ***************************************************************************** * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Morris Jette <jette@llnl.gov>, Kevin Tew <tew1@llnl.gov>,et. al. + * Written by Morris Jette <jette@llnl.gov>, Kevin Tew + * <tew1@llnl.gov>, et. al. * UCRL-CODE-217948. * * This file is part of SLURM, a resource management program. 
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 509547081c0822cdb2152224d05209f5bc46fccf..d5b013ba0c8cbdac744e3b69db41fe92c82e98ff 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -16,7 +16,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -452,17 +452,21 @@ _pick_step_nodes (struct job_record *job_ptr, xfree(step_spec->node_list); step_spec->task_dist = SLURM_DIST_BLOCK; FREE_NULL_BITMAP(selected_nodes); - } else { - /* use selected nodes to run the job */ - FREE_NULL_BITMAP(nodes_avail); - return selected_nodes; } - } else { - /* set the nodes_avail to be the new set */ - FREE_NULL_BITMAP(nodes_avail); - nodes_avail = selected_nodes; step_spec->node_count = bit_set_count(nodes_avail); } + if (selected_nodes) { + /* use selected nodes to run the job and + * make them unavailable for future use */ + nodes_picked = bit_copy(selected_nodes); + bit_not(selected_nodes); + bit_and(nodes_avail, selected_nodes); + bit_free(selected_nodes); + } + } else { + nodes_picked = bit_alloc(bit_size(nodes_avail)); + if (nodes_picked == NULL) + fatal("bit_alloc malloc failure"); } if (step_spec->relative != (uint16_t)NO_VAL) { @@ -480,13 +484,9 @@ _pick_step_nodes (struct job_record *job_ptr, bit_not (relative_nodes); bit_and (nodes_avail, relative_nodes); bit_free (relative_nodes); - nodes_picked = bit_alloc(bit_size(nodes_avail)); - if ((nodes_picked == NULL)) - fatal("bit_alloc malloc failure"); } else { - nodes_picked = bit_alloc(bit_size(nodes_avail)); - nodes_idle = bit_alloc(bit_size(nodes_avail)); - 
if ((nodes_picked == NULL) || (nodes_idle == NULL)) + nodes_idle = bit_alloc (bit_size (nodes_avail) ); + if (nodes_idle == NULL) fatal("bit_alloc malloc failure"); step_iterator = list_iterator_create(job_ptr->step_list); @@ -710,6 +710,7 @@ step_create(job_step_create_request_msg_t *step_specs, fatal ("create_step_record failed with no memory"); /* set the step_record values */ + /* Here is where the node list is set for the step */ if(step_specs->node_list && step_specs->task_dist == SLURM_DIST_ARBITRARY) { @@ -720,6 +721,7 @@ step_create(job_step_create_request_msg_t *step_specs, step_node_list = bitmap2node_name(nodeset); step_specs->node_list = xstrdup(step_node_list); } + step_ptr->step_node_bitmap = nodeset; step_ptr->cyclic_alloc = (uint16_t) (step_specs->task_dist == SLURM_DIST_CYCLIC); diff --git a/src/srun/allocate.c b/src/srun/allocate.c index db22359fdc25278b2f482d772d4dd3424fdea2fc..66fd2217eebaefc667a4213d394b0861b8cf25d8 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -16,7 +16,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. 
You must obey the GNU * General Public License in all respects for all of the code used other than @@ -102,6 +102,7 @@ allocate_nodes(void) sigset_t oset; resource_allocation_response_msg_t *resp = NULL; job_desc_msg_t *j = job_desc_msg_create_from_opts (NULL); + if(!j) return NULL; @@ -132,21 +133,11 @@ allocate_nodes(void) if ((rc == 0) && (resp->node_list == NULL)) { if (resp->error_code) - verbose("Warning: %s", slurm_strerror(resp->error_code)); + verbose("Warning: %s", + slurm_strerror(resp->error_code)); _wait_for_resources(&resp); } - /* For diagnosing a node problem, administrators need to sometimes - * run a job on N nodes one of which must be the node believed to - * have a problem (e.g. "srun -N4 -w bad_node diagnostic"). The - * below logic prevents this from working and necessiates the - * admin identify four specific nodes to use for the above test - * instead of just the one bad node. Otherwise only the one - * bad node is used in the job's allocation. */ - if(resp->node_list && j->req_nodes) { - xfree(resp->node_list); - resp->node_list = xstrdup(j->req_nodes); - } - + done: xsignal_set_mask(&oset); xsignal(SIGINT, ointf); diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c index 1f1c72d1634b320b0350b8dcbc298b1e1afe707e..91c09d359734e0649d3f00b6f4e1c9087ca8d7f9 100644 --- a/src/srun/srun_job.c +++ b/src/srun/srun_job.c @@ -215,8 +215,8 @@ job_step_create_allocation(uint32_t job_id) hostlist_destroy(hl); xfree(opt.nodelist); opt.nodelist = xstrdup(buf); - xfree(ai->nodelist); - ai->nodelist = xstrdup(buf); + /* xfree(ai->nodelist); */ +/* ai->nodelist = xstrdup(buf); */ } if(opt.nodelist) { @@ -228,11 +228,12 @@ job_step_create_allocation(uint32_t job_id) hostlist_ranged_string(hl, sizeof(buf), buf); count = hostlist_count(hl); hostlist_destroy(hl); - xfree(ai->nodelist); - ai->nodelist = xstrdup(buf); + /* xfree(ai->nodelist); */ +/* ai->nodelist = xstrdup(buf); */ xfree(opt.nodelist); opt.nodelist = xstrdup(buf); } + if(opt.distribution == 
SLURM_DIST_ARBITRARY) { if(count != opt.nprocs) { error("You asked for %d tasks but specified %d nodes", @@ -308,14 +309,16 @@ job_step_create_allocation(uint32_t job_id) } /* get the correct number of hosts to run tasks on */ - if(opt.nodelist) { - hl = hostlist_create(opt.nodelist); - hostlist_uniq(hl); - ai->nnodes = hostlist_count(hl); - hostlist_destroy(hl); - } else if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes)) + /* if(opt.nodelist) { */ +/* hl = hostlist_create(opt.nodelist); */ +/* hostlist_uniq(hl); */ +/* ai->nnodes = hostlist_count(hl); */ +/* hostlist_destroy(hl); */ +/* } else */ + if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes)) ai->nnodes = opt.max_nodes; - +/* info("looking for %d nodes out of %s with a must list of %s", */ +/* ai->nnodes, ai->nodelist, opt.nodelist); */ /* * Create job */ @@ -334,7 +337,7 @@ job_create_allocation(resource_allocation_response_msg_t *resp) { srun_job_t *job; allocation_info_t *i = xmalloc(sizeof(*i)); - + i->nodelist = _normalize_hostlist(resp->node_list); i->nnodes = resp->node_cnt; i->jobid = resp->job_id;