diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index a629aab0990426fbfb2c0408b9f764e88b0968d2..ec1fa9061ffb46b74781cbdc2a0c80547e27f66f 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -81,12 +81,18 @@ static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER; static List pend_job_list = NULL; +/* Backfill scheduling has considerable overhead, + * so only attempt it every BACKFILL_INTERVAL seconds */ +#ifndef BACKFILL_INTERVAL +# define BACKFILL_INTERVAL 10 +#endif + /* Set __DEBUG to get detailed logging for this thread without * detailed logging for the entire slurmctld daemon */ #define __DEBUG 0 + #define MAX_BACKFILL_JOB_CNT 100 #define ONE_DAY (24 * 60 * 60) -#define SLEEP_TIME 2 /*********************** local functions *********************/ static int _add_pending_job(struct job_record *job_ptr, @@ -96,7 +102,6 @@ static void _backfill_part(struct part_record *part_ptr); static void _change_prio(struct job_record *job_ptr, uint32_t prio); static void _diff_tv_str(struct timeval *tv1,struct timeval *tv2, char *tv_str, int len_tv_str); -static bool _has_state_changed(void); static bool _more_work(void); static int _part_prio_sort(void *x, void *y); static int _sort_by_prio(void *x, void *y); @@ -157,6 +162,8 @@ extern void *backfill_agent(void *args) bool filter_root = false; ListIterator part_iterator; struct part_record *part_ptr; + time_t now; + static time_t last_backfill_time = 0; /* Read config, node, and partitions; Write jobs */ slurmctld_lock_t all_locks = { READ_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; @@ -164,10 +171,13 @@ extern void *backfill_agent(void *args) if (slurm_get_root_filter()) filter_root = true; while (!stop_backfill) { - sleep(SLEEP_TIME); /* don't run continuously */ + sleep(1); /* don't run continuously */ - if ((!_more_work()) || (!_has_state_changed()) || stop_backfill) + now = time(NULL); + if ((difftime(now, 
last_backfill_time) < BACKFILL_INTERVAL) || + stop_backfill || (!_more_work())) continue; + last_backfill_time = now; gettimeofday(&tv1, NULL); lock_slurmctld(all_locks); @@ -212,32 +222,29 @@ extern void run_backfill (void) pthread_mutex_unlock( &thread_flag_mutex ); } -static bool _more_work (void) -{ - static bool rc; - pthread_mutex_lock( &thread_flag_mutex ); - rc = new_work; - new_work = false; - pthread_mutex_unlock( &thread_flag_mutex ); - return rc; -} - /* Report if any changes occurred to job, node or partition information */ -static bool _has_state_changed(void) +static bool _more_work (void) { + bool rc; static time_t backfill_job_time = (time_t) 0; - static time_t backfill_node_time = (time_t) 0; + static time_t backfill_node_time = (time_t) 0; static time_t backfill_part_time = (time_t) 0; + pthread_mutex_lock( &thread_flag_mutex ); if ( (backfill_job_time == last_job_update ) && (backfill_node_time == last_node_update) && - (backfill_part_time == last_part_update) ) - return false; - - backfill_job_time = last_job_update; - backfill_node_time = last_node_update; - backfill_part_time = last_part_update; - return true; + (backfill_part_time == last_part_update) && + (new_work == false) ) { + rc = false; + } else { + backfill_job_time = last_job_update; + backfill_node_time = last_node_update; + backfill_part_time = last_part_update; + new_work = false; + rc = true; + } + pthread_mutex_unlock( &thread_flag_mutex ); + return rc; } /* Attempt to perform backfill scheduling on the specified partition */ @@ -418,6 +425,8 @@ static void _backfill_part(struct part_record *part_ptr) if (i >= node_space_recs) /* job runs too long */ continue; avail_bitmap = bit_copy(node_space[i].avail_bitmap); + if (job_req_node_filter(job_ptr, avail_bitmap)) + continue; if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, diff --git a/src/plugins/sched/wiki2/job_will_run.c b/src/plugins/sched/wiki2/job_will_run.c index 
6664c61c9295a196efc326a084d2ab0fb3167cf9..81e2ae93591edc585158eddb2086419301a7e260 100644 --- a/src/plugins/sched/wiki2/job_will_run.c +++ b/src/plugins/sched/wiki2/job_will_run.c @@ -154,7 +154,10 @@ static char * _will_run_test(uint32_t jobid, char *node_list, /* Only consider nodes that are not DOWN or DRAINED */ bit_and(avail_bitmap, avail_node_bitmap); } - + if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) { + /* Job probably has invalid feature list */ + rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; + } if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap); diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index b8d80756fb84be9c560deed21bc2b336a86175bc..e2cb5b11150cf7897a31ba7bd92c43abbe8a795d 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -1143,6 +1143,86 @@ static int _build_feature_list(struct job_record *job_ptr) return SLURM_SUCCESS; } +/* + * job_req_node_filter - job request node filter. + * clear from a bitmap the nodes which can not be used for a job + * test memory size, required features, processor count, etc.
+ * IN job_ptr - pointer to job record to be scheduled + * IN/OUT bitmap - set of nodes being considered for use + * RET SLURM_SUCCESS or EINVAL if can't filter (exclusive OR of features) + */ +extern int job_req_node_filter(struct job_record *job_ptr, + bitstr_t *avail_bitmap) +{ + int i; + struct job_details *detail_ptr = job_ptr->details; + multi_core_data_t *mc_ptr; + struct node_record *node_ptr; + struct config_record *config_ptr; + bitstr_t *feature_bitmap; + + if (detail_ptr == NULL) { + error("job_req_node_filter: job %u has no details", + job_ptr->job_id); + return EINVAL; + } + if (_build_feature_list(job_ptr)) + return EINVAL; + + mc_ptr = detail_ptr->mc_ptr; + for (i=0; i< node_record_count; i++) { + if (!bit_test(avail_bitmap, i)) + continue; + node_ptr = node_record_table_ptr + i; + config_ptr = node_ptr->config_ptr; + feature_bitmap = _valid_features(detail_ptr, config_ptr->feature); + if ((feature_bitmap == NULL) || (!bit_test(feature_bitmap, 0))) { + bit_clear(avail_bitmap, i); + FREE_NULL_BITMAP(feature_bitmap); + continue; + } + FREE_NULL_BITMAP(feature_bitmap); + if (slurmctld_conf.fast_schedule) { + if ((detail_ptr->job_min_procs > config_ptr->cpus ) + || (detail_ptr->job_min_memory > config_ptr->real_memory) + || (detail_ptr->job_max_memory > config_ptr->real_memory) + || (detail_ptr->job_min_tmp_disk > config_ptr->tmp_disk)) { + bit_clear(avail_bitmap, i); + continue; + } + if (mc_ptr + && ((mc_ptr->min_sockets > config_ptr->sockets ) + || (mc_ptr->min_cores > config_ptr->cores ) + || (mc_ptr->min_threads > config_ptr->threads ) + || (mc_ptr->job_min_sockets > config_ptr->sockets ) + || (mc_ptr->job_min_cores > config_ptr->cores ) + || (mc_ptr->job_min_threads > config_ptr->threads ))) { + bit_clear(avail_bitmap, i); + continue; + } + } else { + if ((detail_ptr->job_min_procs > node_ptr->cpus ) + || (detail_ptr->job_min_memory > node_ptr->real_memory) + || (detail_ptr->job_max_memory > node_ptr->real_memory) + || (detail_ptr->job_min_tmp_disk > 
node_ptr->tmp_disk)) { + bit_clear(avail_bitmap, i); + continue; + } + if (mc_ptr + && ((mc_ptr->min_sockets > node_ptr->sockets ) + || (mc_ptr->min_cores > node_ptr->cores ) + || (mc_ptr->min_threads > node_ptr->threads ) + || (mc_ptr->job_min_sockets > node_ptr->sockets ) + || (mc_ptr->job_min_cores > node_ptr->cores ) + || (mc_ptr->job_min_threads > node_ptr->threads ))) { + bit_clear(avail_bitmap, i); + continue; + } + } + } + return SLURM_SUCCESS; +} + /* * _build_node_list - identify which nodes could be allocated to a job * based upon node features, memory, processors, etc. Note that a diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 920e51b7f761186ffa8ecf9b3f3fc422dabd45d2..76783be10a2f013dd30b6ef5bf34a9b32d98df13 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -897,6 +897,17 @@ extern int job_complete (uint32_t job_id, uid_t uid, bool requeue, */ extern bool job_independent(struct job_record *job_ptr); +/* + * job_req_node_filter - job request node filter. + * clear from a bitmap the nodes which can not be used for a job + * test memory size, required features, processor count, etc. + * IN job_ptr - pointer to job record to be scheduled + * IN/OUT bitmap - set of nodes being considered for use + * RET SLURM_SUCCESS or EINVAL if can't filter (exclusive OR of features) + */ +extern int job_req_node_filter(struct job_record *job_ptr, + bitstr_t *avail_bitmap); + /* * job_requeue - Requeue a running or pending batch job * IN uid - user id of user issuing the RPC