diff --git a/NEWS b/NEWS index f1add3d6be0d01f51aba7e474e985b9a139e8364..6abda7c53995af37e27ac7df70c754e8c2827a16 100644 --- a/NEWS +++ b/NEWS @@ -35,6 +35,11 @@ documents those changes that are of interest to users and administrators. we have the pids added to the system correctly. -- Add support for job dependencies with job array expressions. Previous logic required listing each task of job array individually. + -- Make sure tres_cnt is set before creating a slurmdb_assoc_usage_t. + -- Prevent backfill scheduler from starting a second "singleton" job if another + one started during a backfill sleep. + -- Fix for invalid array pointer when creating advanced reservation when job + allocations span heterogeneous nodes (differing core or socket counts). * Changes in Slurm 16.05.1 ========================== @@ -352,6 +357,8 @@ documents those changes that are of interest to users and administrators. -- MYSQL - Fix issue with adding a reservation if the name has single quotes in it. -- Correctly print ranges when using step values in job arrays. + -- Fix for invalid array pointer when creating advanced reservation when job + allocations span heterogeneous nodes (differing core or socket counts). 
* Changes in Slurm 15.08.12 =========================== diff --git a/doc/html/slurm_ug_cfp.shtml b/doc/html/slurm_ug_cfp.shtml index ebb925ca98f3c21b5b208825fc4e8dc4f166bbe7..30ca1da411c96cd67e1e8badee39e3a198773d1a 100644 --- a/doc/html/slurm_ug_cfp.shtml +++ b/doc/html/slurm_ug_cfp.shtml @@ -33,7 +33,7 @@ or tutorial about Slurm is invited to send an abstract to </p> <p><b>Program Committee:</b><br> -Vangelis Floros (GRNET) +Vangelis Floros (GRNET)<br> Yiannis Georgiou (Bull)<br> Brian Gilmer (Cray)<br> Matthieu Hautreux (CEA)<br> diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index 707063badd139332db2e04a9714afae5a45b73f4..6502619a0fcd4a5efb4bc2f63d9bb9ca2be38759 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -726,7 +726,7 @@ static slurmdb_assoc_rec_t* _find_assoc_parent( } /* locks should be put in place before calling this function - * ASSOC_WRITE, USER_WRITE */ + * ASSOC_WRITE, USER_WRITE, TRES_READ */ static int _set_assoc_parent_and_user(slurmdb_assoc_rec_t *assoc, int reset) { @@ -906,7 +906,7 @@ static void _set_children_level_shares(slurmdb_assoc_rec_t *assoc, /* transfer slurmdb assoc list to be assoc_mgr assoc list */ /* locks should be put in place before calling this function - * ASSOC_WRITE, USER_WRITE */ + * ASSOC_WRITE, USER_WRITE, TRES_READ */ static int _post_assoc_list(void) { slurmdb_assoc_rec_t *assoc = NULL; diff --git a/src/common/slurmdb_defs.c b/src/common/slurmdb_defs.c index a42ba4dfe25f3d4784373f346e8f5076fc3a0c19..f7e37f25908003d27303e95bbe0a2821f80f6e85 100644 --- a/src/common/slurmdb_defs.c +++ b/src/common/slurmdb_defs.c @@ -583,8 +583,14 @@ extern slurmdb_step_rec_t *slurmdb_create_step_rec() extern slurmdb_assoc_usage_t *slurmdb_create_assoc_usage(int tres_cnt) { - slurmdb_assoc_usage_t *usage = - xmalloc(sizeof(slurmdb_assoc_usage_t)); + slurmdb_assoc_usage_t *usage; + int alloc_size; + + if (!tres_cnt) + fatal("%s: You need to give a tres_cnt to call this function", + __func__); + + usage = 
xmalloc(sizeof(slurmdb_assoc_usage_t)); usage->level_shares = NO_VAL; usage->shares_norm = NO_VAL64; @@ -594,13 +600,13 @@ extern slurmdb_assoc_usage_t *slurmdb_create_assoc_usage(int tres_cnt) usage->level_fs = 0; usage->fs_factor = 0; - if (tres_cnt) { - int alloc_size = sizeof(uint64_t) * tres_cnt; - usage->tres_cnt = tres_cnt; - usage->grp_used_tres = xmalloc(alloc_size); - usage->grp_used_tres_run_secs = xmalloc(alloc_size); - usage->usage_tres_raw = xmalloc(sizeof(long double) * tres_cnt); - } + usage->tres_cnt = tres_cnt; + + alloc_size = sizeof(uint64_t) * tres_cnt; + usage->grp_used_tres = xmalloc(alloc_size); + usage->grp_used_tres_run_secs = xmalloc(alloc_size); + + usage->usage_tres_raw = xmalloc(sizeof(long double) * tres_cnt); return usage; } diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 0bddb5fe42684f842664ae25cdfbf81d9b2b2cb0..73ae1875bf95daeb98e6b476921bc7d13020e16b 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -1343,6 +1343,11 @@ next_task: continue; if (!avail_front_end(job_ptr)) continue; /* No available frontend */ + if (!job_independent(job_ptr, 0)) { + /* No longer independent + * (e.g. 
another singleton started) */ + continue; + } job_ptr->time_limit = save_time_limit; job_ptr->part_ptr = part_ptr; diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index bcb27197f3cb06a70e7fddeeb0d9d37cc01771ce..f26f3d900cd4377e098d515fbebfb1589befbf05 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -3958,7 +3958,7 @@ static void _check_job_compatibility(struct job_record *job_ptr, { uint32_t total_nodes; bitstr_t *full_node_bitmap; - int i_core, i_node; + int i_core, i_node, res_inx = 0; /* sockets_per_node[] index */ int start = 0; int rep_count = 0; job_resources_t *job_res = job_ptr->job_resrcs; @@ -3984,9 +3984,8 @@ static void _check_job_compatibility(struct job_record *job_ptr, i_node = 0; while (i_node < total_nodes) { - int cores_in_a_node = (job_res->sockets_per_node[i_node] * - job_res->cores_per_socket[i_node]); - + int cores_in_a_node = (job_res->sockets_per_node[res_inx] * + job_res->cores_per_socket[res_inx]); int repeat_node_conf = job_res->sock_core_rep_count[rep_count++]; int node_bitmap_inx; @@ -3997,6 +3996,7 @@ static void _check_job_compatibility(struct job_record *job_ptr, #endif i_node += repeat_node_conf; + res_inx++; while (repeat_node_conf--) { int allocated;