diff --git a/NEWS b/NEWS index e761db3e944407c95c08e97fbe259763d03ad660..c562ae225b36ae3270bc9d73b13ef00e923bdd7f 100644 --- a/NEWS +++ b/NEWS @@ -69,7 +69,11 @@ documents those changes that are of interest to users and admins. -- BLUEGENE - more robust checking for states when freeing blocks. -- Added correct files to the slurm.spec file for correct perl api rpm creation. - + -- Added flag "NoReserve" to a QOS to make it so all jobs are created equal + within a QOS. So if larger, higher priority jobs are unable to run, they + don't prevent smaller jobs from running even if running the smaller + jobs delays the start of the larger, higher priority jobs. + * Changes in SLURM 2.2.0 ======================== -- Change format of Duration field in "scontrol show reservation" output from diff --git a/doc/man/man1/sacctmgr.1 b/doc/man/man1/sacctmgr.1 index 5a7d419244883038fa28539894360e41cf916c3e..88c01be6beed2b38c6f8110c1b0f93f0d25d6584 100644 --- a/doc/man/man1/sacctmgr.1 +++ b/doc/man/man1/sacctmgr.1 @@ -900,6 +900,14 @@ If set, and the QOS also has a UsageThreshold, any jobs submitted with this QOS that fall below the UsageThreshold will be held until their Fairshare Usage goes above the Threshold. .TP +\fINoReserve\fP +If set, and using backfill, jobs using this QOS will all be considered +at the same level within this QOS, meaning that if a larger, higher priority +job is unable to run, a smaller job will run if possible, even if this +delays the start of the larger, higher priority job. +NOTE: This could cause starvation of these larger jobs, but if that is +acceptable, this flag will most likely increase utilization. +.TP \fIPartitionMaxNodes\fP If set jobs using this QOS will be able to override the requested partition's MaxNodes limit. 
diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 5eca517ebd77f7c02877a55b4a2b76d103ebbb34..eaa68c19ed1019207b3579b25a25f3b950774038 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -477,7 +477,11 @@ round\-robin fashion). For example, consider an allocation of three nodes each with two cpus. A four\-task cyclic distribution request will distribute those tasks to the nodes with tasks one and four on the first node, task two on the second node, and task three on the -third node. Cyclic distribution is the default behavior if the number +third node. +Note that when SelectType is select/cons_res the same number of CPUs +may not be allocated on each node. Distribution will be round\-robin +among all the nodes with CPUs yet to be allocated. +Cyclic distribution is the default behavior if the number of tasks is no larger than the number of allocated nodes. .TP .B plane diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 55948ec0d2a361c29c335aa1c250ac6c56803e86..4fcb3fd7393db6cb7ac0e9c22f8c5b50c6f91040 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -536,7 +536,11 @@ round\-robin fashion). For example, consider an allocation of three nodes each with two cpus. A four\-task cyclic distribution request will distribute those tasks to the nodes with tasks one and four on the first node, task two on the second node, and task three on the -third node. Cyclic distribution is the default behavior if the number +third node. +Note that when SelectType is select/cons_res the same number of CPUs +may not be allocated on each node. Distribution will be round\-robin +among all the nodes with CPUs yet to be allocated. +Cyclic distribution is the default behavior if the number of tasks is no larger than the number of allocated nodes. 
.TP .B plane diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index d9b9d9edd33177e5e7ee8e90fc0915d9c3580adc..e6ac541edc8beac8242788ecebfeb67b5b979e0a 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -533,7 +533,11 @@ round\-robin fashion). For example, consider an allocation of three nodes each with two cpus. A four\-task cyclic distribution request will distribute those tasks to the nodes with tasks one and four on the first node, task two on the second node, and task three on the -third node. Cyclic distribution is the default behavior if the number +third node. +Note that when SelectType is select/cons_res the same number of CPUs +may not be allocated on each node. Distribution will be round\-robin +among all the nodes with CPUs yet to be allocated. +Cyclic distribution is the default behavior if the number of tasks is no larger than the number of allocated nodes. .TP .B plane diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index e2614172b54b05cfc680b9d5ac1e28e7a157434d..8f601212e9e1d7a212220da9dc00ddb3ba97136a 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1382,7 +1382,8 @@ scheduling module "sched/backfill" (see \fBSchedulerType\fR). .TP \fBSchedulerTimeSlice\fR -Number of seconds in each time slice when \fBSchedulerType=sched/gang\fR. +Number of seconds in each time slice when gang scheduling is enabled +(\fBPreemptMode=GANG\fR). The default value is 30 seconds. 
.TP diff --git a/slurm/slurmdb.h b/slurm/slurmdb.h index 61a2c17833fcca4f840818a769a14d92914211ce..0e9b3c839b133dae48d45a732a8ac9ab6aa42440 100644 --- a/slurm/slurmdb.h +++ b/slurm/slurmdb.h @@ -141,6 +141,7 @@ typedef enum { #define QOS_FLAG_PART_MAX_NODE 0x00000002 #define QOS_FLAG_PART_TIME_LIMIT 0x00000004 #define QOS_FLAG_ENFORCE_USAGE_THRES 0x00000008 +#define QOS_FLAG_NO_RESERVE 0x00000010 /* Archive / Purge time flags */ #define SLURMDB_PURGE_BASE 0x0000ffff /* Apply to get the number diff --git a/src/common/slurmdb_defs.c b/src/common/slurmdb_defs.c index 8f6cc183c0533c8766fd36ce37c3416a2ed3ce8a..e198386b25bdd41a74f6d7b57716dbc809a95bfd 100644 --- a/src/common/slurmdb_defs.c +++ b/src/common/slurmdb_defs.c @@ -311,6 +311,9 @@ static uint32_t _str_2_qos_flags(char *flags) if (slurm_strcasestr(flags, "PartitionTimeLimit")) return QOS_FLAG_PART_TIME_LIMIT; + if (slurm_strcasestr(flags, "NoReserve")) + return QOS_FLAG_NO_RESERVE; + return 0; } @@ -1188,6 +1191,8 @@ extern char *slurmdb_qos_flags_str(uint32_t flags) xstrcat(qos_flags, "Remove,"); if (flags & QOS_FLAG_ENFORCE_USAGE_THRES) xstrcat(qos_flags, "EnforceUsageThreshold,"); + if (flags & QOS_FLAG_NO_RESERVE) + xstrcat(qos_flags, "NoReserve,"); if (flags & QOS_FLAG_PART_MAX_NODE) xstrcat(qos_flags, "PartitionMaxNodes,"); if (flags & QOS_FLAG_PART_MIN_NODE) diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index fc68c78434b4b18662dfd7af9ef3f52fa09b47c5..62b7fee2df04d526b3fd93365e0d03fdc7bd3a35 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -128,8 +128,8 @@ static int _num_feature_count(struct job_record *job_ptr); static void _reset_job_time_limit(struct job_record *job_ptr, time_t now, node_space_map_t *node_space); static int _start_job(struct job_record *job_ptr, bitstr_t *avail_bitmap); -static bool _test_resv_overlap(node_space_map_t *node_space, - bitstr_t *use_bitmap, uint32_t start_time, +static bool 
_test_resv_overlap(node_space_map_t *node_space, + bitstr_t *use_bitmap, uint32_t start_time, uint32_t end_reserve); static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, uint32_t min_nodes, uint32_t max_nodes, @@ -445,10 +445,10 @@ static int _yield_locks(void) _my_sleep(backfill_interval); lock_slurmctld(all_locks); - if ((last_job_update == job_update) && - (last_node_update == node_update) && + if ((last_job_update == job_update) && + (last_node_update == node_update) && (last_part_update == part_update) && - (! stop_backfill) && (! config_flag)) + (! stop_backfill) && (! config_flag)) return 0; else return 1; @@ -459,6 +459,7 @@ static int _attempt_backfill(void) bool filter_root = false; List job_queue; job_queue_rec_t *job_queue_rec; + slurmdb_qos_rec_t *qos_ptr = NULL; int i, j, node_space_recs; struct job_record *job_ptr; struct part_record *part_ptr; @@ -499,7 +500,7 @@ static int _attempt_backfill(void) if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_node_space_table(node_space); - while ((job_queue_rec = (job_queue_rec_t *) + while ((job_queue_rec = (job_queue_rec_t *) list_pop_bottom(job_queue, sort_job_queue2))) { job_ptr = job_queue_rec->job_ptr; part_ptr = job_queue_rec->part_ptr; @@ -645,7 +646,7 @@ static int _attempt_backfill(void) if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ job_ptr->time_limit = comp_time_limit; - job_ptr->end_time = job_ptr->start_time + + job_ptr->end_time = job_ptr->start_time + (comp_time_limit * 60); _reset_job_time_limit(job_ptr, now, node_space); @@ -684,18 +685,21 @@ static int _attempt_backfill(void) } end_reserve = job_ptr->start_time + (time_limit * 60); - if (_test_resv_overlap(node_space, avail_bitmap, + if (_test_resv_overlap(node_space, avail_bitmap, job_ptr->start_time, end_reserve)) { /* This job overlaps with an existing reservation for - * job to be backfill scheduled, which the sched + * job to be backfill scheduled, which the sched * plugin 
does not know about. Try again later. */ later_start = job_ptr->start_time; goto TRY_LATER; } /* - * Add reservation to scheduling table + * Add reservation to scheduling table if appropriate */ + qos_ptr = job_ptr->qos_ptr; + if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) + continue; bit_not(avail_bitmap); _add_reservation(job_ptr->start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); @@ -765,7 +769,7 @@ static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap) return rc; } -/* Reset a job's time limit (and end_time) as high as possible +/* Reset a job's time limit (and end_time) as high as possible * within the range job_ptr->time_min and job_ptr->time_limit. * Avoid using resources reserved for pending jobs or in resource * reservations */ @@ -881,8 +885,8 @@ static void _add_reservation(uint32_t start_time, uint32_t end_reserve, * IN start_time - start time of job * IN end_reserve - end time of job */ -static bool _test_resv_overlap(node_space_map_t *node_space, - bitstr_t *use_bitmap, uint32_t start_time, +static bool _test_resv_overlap(node_space_map_t *node_space, + bitstr_t *use_bitmap, uint32_t start_time, uint32_t end_reserve) { bool overlap = false; diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 126ab37aa2938677a959a3a2427f3acf2806770e..81aa16116edb40dd5e320db3cc5f997250dabba6 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -305,7 +305,7 @@ static bg_record_t *_find_matching_block(List block_list, if (bg_record->job_ptr) bg_record->job_running = bg_record->job_ptr->job_id; - /*block is messed up some how (BLOCK_ERROR_STATE) + /* block is messed up some how (BLOCK_ERROR_STATE) * ignore it or if state == RM_PARTITION_ERROR */ if ((bg_record->job_running == BLOCK_ERROR_STATE) || (bg_record->state == RM_PARTITION_ERROR)) { @@ -314,25 +314,39 @@ static bg_record_t 
*_find_matching_block(List block_list, "state (can't use)", bg_record->bg_block_id); continue; - } else if ((bg_record->job_running != NO_JOB_RUNNING) - && (bg_record->job_running != job_ptr->job_id) - && ((bg_conf->layout_mode == LAYOUT_DYNAMIC) - || ((!SELECT_IS_CHECK_FULL_SET(query_mode) - || SELECT_IS_MODE_RUN_NOW(query_mode)) - && (bg_conf->layout_mode - != LAYOUT_DYNAMIC)))) { - /* Look here if you are trying to run now or - if you aren't looking at the full set. We - don't continue on running blocks for the - full set because we are seeing if the job - can ever run so look here. - */ - if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) - info("block %s in use by %s job %d", - bg_record->bg_block_id, - bg_record->user_name, - bg_record->job_running); - continue; + } else if ((bg_conf->layout_mode == LAYOUT_DYNAMIC) + || ((!SELECT_IS_CHECK_FULL_SET(query_mode) + || SELECT_IS_MODE_RUN_NOW(query_mode)) + && (bg_conf->layout_mode != LAYOUT_DYNAMIC))) { + if (bg_record->free_cnt) { + /* No reason to look at a block that + is being freed unless we are + running static and looking at the + full set. + */ + if (bg_conf->slurm_debug_flags + & DEBUG_FLAG_BG_PICK) + info("block %s being free for other " + "job(s), skipping", + bg_record->bg_block_id); + continue; + } else if ((bg_record->job_running != NO_JOB_RUNNING) + && (bg_record->job_running + != job_ptr->job_id)) { + /* Look here if you are trying to run now or + if you aren't looking at the full set. We + don't continue on running blocks for the + full set because we are seeing if the job + can ever run so look here. 
+ */ + if (bg_conf->slurm_debug_flags + & DEBUG_FLAG_BG_PICK) + info("block %s in use by %s job %d", + bg_record->bg_block_id, + bg_record->user_name, + bg_record->job_running); + continue; + } } /* Check processor count */ @@ -626,16 +640,13 @@ static int _check_for_booted_overlapping_blocks( bg_block_id); found_record = bg_record->original; - remove_from_bg_list( - bg_lists->main, - found_record); } else { if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) info("looking for " "original"); found_record = - find_and_remove_org_from_bg_list( + find_org_in_bg_list( bg_lists->main, bg_record); } diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 91af9df1071a52301cdf451e06cbdaa760ff06c9..2bede0ef2012a05cbf4f236e8255b9508c686b97 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -696,7 +696,7 @@ extern int free_block_list(uint32_t job_id, List track_in_list, /* Fake a free since we are n deallocating state before this. */ - if (retry_cnt >= 2) + if (retry_cnt >= 3) bg_record->state = RM_PARTITION_FREE; #endif if ((bg_record->state == RM_PARTITION_FREE) @@ -723,7 +723,7 @@ extern int free_block_list(uint32_t job_id, List track_in_list, list_iterator_reset(itr); while ((bg_record = list_next(itr))) { /* block no longer exists */ - if (bg_record->magic == 0) + if (bg_record->magic != BLOCK_MAGIC) continue; if (bg_record->state != RM_PARTITION_FREE) { restore = true; @@ -1570,7 +1570,7 @@ static int _post_block_free(bg_record_t *bg_record, bool restore) #ifdef HAVE_BG_FILES int rc = SLURM_SUCCESS; #endif - if (bg_record->magic == 0) { + if (bg_record->magic != BLOCK_MAGIC) { error("block already destroyed"); return SLURM_ERROR; } @@ -1603,6 +1603,16 @@ static int _post_block_free(bg_record_t *bg_record, bool restore) return SLURM_SUCCESS; } + /* A bit of a sanity check to make sure blocks are being + removed out of all the lists. 
+ */ + if (blocks_are_created) { + remove_from_bg_list(bg_lists->booted, bg_record); + if (remove_from_bg_list(bg_lists->job_running, bg_record) + == SLURM_SUCCESS) + num_unused_cpus += bg_record->cpu_cnt; + } + if (restore) return SLURM_SUCCESS; @@ -1667,7 +1677,7 @@ static void *_track_freeing_blocks(void *args) /* Fake a free since we are n deallocating state before this. */ - if (retry_cnt >= 2) + if (retry_cnt >= 3) bg_record->state = RM_PARTITION_FREE; #endif if ((bg_record->state == RM_PARTITION_FREE) @@ -1677,7 +1687,7 @@ static void *_track_freeing_blocks(void *args) slurm_mutex_unlock(&block_state_mutex); if (free_cnt == track_cnt) break; - debug("_track_freeing_blocks: freed %d of %d for", + debug("_track_freeing_blocks: freed %d of %d", free_cnt, track_cnt); sleep(FREE_SLEEP_INTERVAL); retry_cnt++;