diff --git a/NEWS b/NEWS
index 6e5bb4eebb8201d97dca9691edfeea348b4e85b5..624af4ac6997a011d32b208ae3beee59d4bf7e4b 100644
--- a/NEWS
+++ b/NEWS
@@ -48,6 +48,8 @@ documents those changes that are of interest to users and admins.
  -- Added PMI timing information to srun debug mode to aid in tuning. Use
     "srun -vv ..." to see the information.
  -- Added checkpoint/ompi (OpenMPI) plugin (still under development).
+ -- Fix bug in load leveling logic added to v1.2.13 which can cause an
+    infinite loop and hang slurmctld when sharing nodes between jobs.
 
 * Changes in SLURM 1.2.13
 =========================
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 8660faf2e4e21240a43c2eb3713cdf143ac95a3b..646316289a00097d065f6f28be556cba80e718a1 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -289,24 +289,32 @@ _pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap,
 		uint32_t req_nodes, bool test_only)
 {
 	bitstr_t *basemap;
-	int i, error_code, node_cnt, prev_cnt = 0, equal = 0;
-
+	int i, error_code = EINVAL, node_cnt = 0, prev_cnt = 0, set_cnt;
+
+	/* Reject an unusable bitmap before copying it, so that the early
+	 * return below cannot leak basemap. */
+	set_cnt = bit_set_count(bitmap);
+	if ((set_cnt < min_nodes) ||
+	    ((req_nodes > min_nodes) && (set_cnt < req_nodes)))
+		return error_code;	/* not usable */
+
 	basemap = bit_copy(bitmap);
 	if (basemap == NULL)
 		fatal("bit_copy malloc failure");
-
-	for (i = 0; 1; i++) {
+
+	/* The loop ends once node_cnt reaches set_cnt; this bound replaces
+	 * the old unconditional loop that could spin forever. */
+	for (i=0; node_cnt<set_cnt; i++) {
 		node_cnt = _job_count_bitmap(basemap, bitmap, i);
 		if ((node_cnt == 0) || (node_cnt == prev_cnt))
-			continue;
+			continue;	/* nothing new to test */
 		if ((node_cnt < min_nodes) ||
 		    ((req_nodes > min_nodes) && (node_cnt < req_nodes)))
-			continue;
-		equal = bit_equal(basemap, bitmap);
+			continue;	/* need more nodes */
 		error_code = select_g_job_test(job_ptr, bitmap,
 					       min_nodes, max_nodes,
 					       req_nodes, test_only);
-		if (!error_code || equal)
+		if (!error_code)
 			break;
 		prev_cnt = node_cnt;
 	}