diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index b970700bf12fba4cbbe90e64c48214b1e13b80d8..628a87f6ef24ee89a7c1177f82a451d59ce95ce5 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1440,9 +1440,8 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run, no_alloc = test_only || too_fragmented || (!top_prio) || (!independent); - error_code = select_nodes(job_ptr, no_alloc); - + if ((error_code == ESLURM_NODES_BUSY) || (error_code == ESLURM_JOB_HELD) || (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) { diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 209520ad8452e9959186776c0cb7a75381e4f311..f876860c6164cc49e6d4b9173865908cbd5d8dfb 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -20,7 +20,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -30,17 +30,6 @@ * version. If you delete this exception statement from all source files in * the program, then also delete it here. * - * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under - * certain conditions as described in each individual source file, and - * distribute linked combinations including the two. You must obey the GNU - * General Public License in all respects for all of the code used other than - * OpenSSL. If you modify file(s) with this exception, you may extend this - * exception to your version of the file(s), but you are not obligated to do - * so. 
If you do not wish to do so, delete this exception statement from your - * version. If you delete this exception statement from all source files in - * the program, then also delete it here. - * * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 2846e4cefacc87acd7f9f5b8e30a014cfa39d68a..0c97fe1cc6ccf32f5224e471236cb8b2b4c309c4 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -293,10 +293,10 @@ _pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap, { bitstr_t *no_load_bit, *light_load_bit, *heavy_load_bit; int error_code; - + _node_load_bitmaps(bitmap, &no_load_bit, &light_load_bit, &heavy_load_bit); - + /* first try to use idle nodes */ bit_and(bitmap, no_load_bit); FREE_NULL_BITMAP(no_load_bit); @@ -462,7 +462,7 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, bool runable_avail = false; /* Job can run with available nodes */ int cr_enabled = 0; int shared = 0; - + if (node_set_size == 0) { info("_pick_best_nodes: empty node set for selection"); return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; @@ -580,12 +580,18 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, if (node_set_ptr[i].feature < min_feature) min_feature = node_set_ptr[i].feature; } - + for (j = min_feature; j <= max_feature; j++) { + /* We use this var to go straight down the list: if the + * first one doesn't work, we go to the next, until the + * list is empty. 
+ */ + int tries = 0; for (i = 0; i < node_set_size; i++) { bool pick_light_load = false; if (node_set_ptr[i].feature != j) continue; + if (!runable_ever) { int cr_disabled = 0; error_code = _add_node_set_info( @@ -603,6 +609,7 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, } } bit_and(node_set_ptr[i].my_bitmap, avail_node_bitmap); + if (cr_enabled) { bit_and(node_set_ptr[i].my_bitmap, partially_idle_node_bitmap); @@ -633,7 +640,7 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, } node_set_ptr[i].nodes = bit_set_count(node_set_ptr[i].my_bitmap); - error_code = _add_node_set_info(&node_set_ptr[i], + error_code = _add_node_set_info(&node_set_ptr[i], &avail_bitmap, &avail_nodes, &avail_cpus, @@ -672,6 +679,7 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, req_nodes, false); } + if (pick_code == SLURM_SUCCESS) { if (bit_set_count(avail_bitmap) > max_nodes) { /* end of tests for this feature */ @@ -682,8 +690,17 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, if (cr_enabled) FREE_NULL_BITMAP( partially_idle_node_bitmap); + *select_bitmap = avail_bitmap; return SLURM_SUCCESS; + } else { + /* reset the counters and start from the + * next node in the list */ + FREE_NULL_BITMAP(avail_bitmap); + avail_nodes = 0; + avail_cpus = 0; + tries++; + i = tries; } } @@ -941,8 +958,9 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only) error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; } else { error_code = _pick_best_nodes(node_set_ptr, node_set_size, - &select_bitmap, job_ptr, part_ptr, - min_nodes, max_nodes, req_nodes); + &select_bitmap, job_ptr, + part_ptr, min_nodes, max_nodes, + req_nodes); } if (error_code) { diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 7da3508678a5fa23321be3f2c1b682dad0d38ea2..b7dc4e168a695bdb1c0f7485804547867049982c 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -17,7 +17,7 @@ * any later version. 
* * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -104,7 +104,7 @@ inline static void _slurm_rpc_job_will_run(slurm_msg_t * msg); inline static void _slurm_rpc_node_registration(slurm_msg_t * msg); inline static void _slurm_rpc_node_select_info(slurm_msg_t * msg); inline static void _slurm_rpc_job_alloc_info(slurm_msg_t * msg); -inline static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg); +inline static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg); inline static void _slurm_rpc_ping(slurm_msg_t * msg); inline static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg); inline static void _slurm_rpc_requeue(slurm_msg_t * msg);