From 43281b779ed0ed9cf69fd030c61fa2012e963b10 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 25 Nov 2008 18:18:33 +0000
Subject: [PATCH] Fix job preemption when sched/gang and select/linear are
 configured with non-sharing partitions. preempt_linear.patch from Chris
 Holmes

---
 NEWS                                          |  2 +
 src/plugins/select/cons_res/select_cons_res.c |  2 +-
 src/plugins/select/linear/select_linear.c     | 71 ++++++++++++++++++-
 src/slurmctld/node_scheduler.c                |  7 ++
 4 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/NEWS b/NEWS
index e4d75a0dc72..a6fd38f1bc6 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,8 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 1.4.0-pre6
 =============================
+ -- Fix job preemption when sched/gang and select/linear are configured with
+    non-sharing partitions.
 
 * Changes in SLURM 1.4.0-pre5
 =============================
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 6e453a80736..4bce32878fa 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -228,7 +228,7 @@ extern bool cr_priority_selection_enabled()
 		if (strcmp(sched_type, "sched/gang") == 0)
 			cr_priority_selection = true;
 		xfree(sched_type);
-		cr_priority_selection = true;
+		cr_priority_test = true;
 	}
 	return cr_priority_selection;
 }
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index 99038cce410..f2992bf12a0 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -135,6 +135,8 @@ static struct node_record *select_node_ptr = NULL;
 static int select_node_cnt = 0;
 static uint16_t select_fast_schedule;
 static uint16_t cr_type;
+static bool cr_priority_test = false;
+static bool cr_priority_selection = false;
 static struct node_cr_record *node_cr_ptr = NULL;
 static pthread_mutex_t cr_mutex = PTHREAD_MUTEX_INITIALIZER;
 
@@ -240,6 +242,19 @@ static int _fini_status_pthread(void)
 }
 #endif
 
+static inline bool _cr_priority_selection_enabled(void)
+{
+	if (!cr_priority_test) {
+		char *sched_type = slurm_get_sched_type();
+		if (strcmp(sched_type, "sched/gang") == 0)
+			cr_priority_selection = true;
+		xfree(sched_type);
+		cr_priority_test = true;
+	}
+	return cr_priority_selection;
+
+}
+
 static bool _enough_nodes(int avail_nodes, int rem_nodes,
 			  uint32_t min_nodes, uint32_t req_nodes)
 {
@@ -556,7 +571,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 	}
 
 	if (mode != SELECT_MODE_TEST_ONLY) {
-		if (job_ptr->details->shared == 1) {
+		if (job_ptr->details->shared) {
 			max_share = job_ptr->part_ptr->max_share &
 					~SHARED_FORCE;
 		} else	/* ((shared == 0) || (shared == (uint16_t) NO_VAL)) */
@@ -575,6 +590,8 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 			job_ptr->details->job_min_memory = 0;
 	}
 
+	debug3("select/linear: job_test: job %u max_share %d avail nodes %u",
+	       job_ptr->job_id, max_share, bit_set_count(bitmap));
 	orig_map = bit_copy(bitmap);
 	for (max_run_job=min_share; max_run_job<max_share; max_run_job++) {
 		bool last_iteration = (max_run_job == (max_share -1));
@@ -586,6 +603,8 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 					   orig_map, bitmap,
 					   max_run_job,
 					   max_run_job + sus_jobs);
+		debug3("select/linear: job_test: found %d nodes for %u",
+		       j, job_ptr->job_id);
 		if ((j == prev_cnt) || (j < min_nodes))
 			continue;
 		prev_cnt = j;
@@ -697,12 +716,62 @@ static int _job_count_bitmap(struct node_cr_record *node_cr_ptr,
 		}
 
 		if ((run_job_cnt != NO_SHARE_LIMIT) &&
+		    (!_cr_priority_selection_enabled()) &&
 		    (node_cr_ptr[i].exclusive_jobid != 0)) {
 			/* already reserved by some exclusive job */
 			bit_clear(jobmap, i);
 			continue;
 		}
 
+		if (_cr_priority_selection_enabled()) {
+			/* clear this node if any higher-priority
+			 * partitions have existing allocations */
+			total_jobs = 0;
+			part_cr_ptr = node_cr_ptr[i].parts;
+			for( ;part_cr_ptr; part_cr_ptr = part_cr_ptr->next) {
+				if (part_cr_ptr->part_ptr->priority <=
+				    job_ptr->part_ptr->priority)
+					continue;
+				total_jobs += part_cr_ptr->tot_job_cnt;
+			}
+			if ((run_job_cnt != NO_SHARE_LIMIT) &&
+			    (total_jobs > 0)) {
+				bit_clear(jobmap, i);
+				continue;
+			}
+			/* if not sharing, then check with other partitions
+			 * of equal priority. Otherwise, load-balance within
+			 * the local partition */
+			total_jobs = 0;
+			total_run_jobs = 0;
+			part_cr_ptr = node_cr_ptr[i].parts;
+			for( ; part_cr_ptr; part_cr_ptr = part_cr_ptr->next) {
+				if (part_cr_ptr->part_ptr->priority !=
+				    job_ptr->part_ptr->priority)
+					continue;
+				if (!job_ptr->details->shared) {
+					total_run_jobs +=
+						part_cr_ptr->run_job_cnt;
+					total_jobs += part_cr_ptr->tot_job_cnt;
+					continue;
+				}
+				if (part_cr_ptr->part_ptr == job_ptr->part_ptr){
+					total_run_jobs +=
+						part_cr_ptr->run_job_cnt;
+					total_jobs += part_cr_ptr->tot_job_cnt;
+					break;
+				}
+			}
+			if ((total_run_jobs <= run_job_cnt) &&
+			    (total_jobs <= tot_job_cnt)) {
+				bit_set(jobmap, i);
+				count++;
+			} else {
+				bit_clear(jobmap, i);
+			}
+			continue;
+		}
+
 		total_jobs = 0;
 		total_run_jobs = 0;
 		part_cr_ptr = node_cr_ptr[i].parts;
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 51260782bcc..1997099601c 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -278,6 +278,13 @@ static int _match_feature(char *seek, struct node_set *node_set_ptr)
  * (uint16_t)NO_VAL = default
  * 0 = exclusive
  * 1 = share=yes
+ *
+ * Return values:
+ *	0 = no sharing
+ *	1 = user requested sharing
+ *	2 = sharing enforced (either by partition or cons_res)
+ *	    (cons_res plugin needs to distinguish between "enforced" and
+ *	     "requested" sharing)
  */
 static int
 _resolve_shared_status(uint16_t user_flag, uint16_t part_max_share,
--
GitLab
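
Illustrative note (not part of the patch above): the core of the new behavior is the
priority check that _job_count_bitmap() gains when _cr_priority_selection_enabled()
returns true, i.e. a node is removed from the candidate map whenever any
higher-priority partition already has jobs allocated on it. The following standalone
sketch models only that first check with invented names (part_entry, node_usable);
the real code walks node_cr_ptr[i].parts and additionally applies the equal-priority
sharing rules shown in the patch.

/* Build with: cc -std=c99 -o prio_sketch prio_sketch.c */
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for SLURM's per-node, per-partition usage records.
 * Field names mirror the patch; the struct itself is hypothetical. */
struct part_entry {
	int priority;			/* partition priority */
	int tot_job_cnt;		/* jobs allocated to this node in this partition */
	int run_job_cnt;		/* jobs actively running (not suspended) */
	struct part_entry *next;	/* next partition record for this node */
};

/* Return true if the node may stay in the candidate map for a job whose
 * partition has priority job_prio, false if a higher-priority partition
 * already has work allocated on the node (the bit_clear() case above). */
static bool node_usable(const struct part_entry *parts, int job_prio)
{
	int higher_prio_jobs = 0;

	for (const struct part_entry *p = parts; p; p = p->next) {
		if (p->priority > job_prio)
			higher_prio_jobs += p->tot_job_cnt;
	}
	return (higher_prio_jobs == 0);
}

int main(void)
{
	/* One node known to two partitions; the high-priority one holds a job. */
	struct part_entry low  = { .priority = 1, .tot_job_cnt = 0,
				   .run_job_cnt = 0, .next = NULL };
	struct part_entry high = { .priority = 5, .tot_job_cnt = 1,
				   .run_job_cnt = 1, .next = &low };

	/* A job submitted to the low-priority partition loses the node ... */
	printf("low-priority job keeps node:  %s\n",
	       node_usable(&high, 1) ? "yes" : "no");
	/* ... while a job in the high-priority partition keeps it. */
	printf("high-priority job keeps node: %s\n",
	       node_usable(&high, 5) ? "yes" : "no");
	return 0;
}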