diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 0e324c710c8b5591766ea33dbf526cdd7dc1cc97..01494f01927dbc622240fea80af3c589f56f2389 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -508,17 +508,6 @@ static int _attempt_backfill(void)
 		if (debug_flags & DEBUG_FLAG_BACKFILL)
 			info("backfill test for job %u", job_ptr->job_id);
 
-		if (!acct_policy_job_runnable_state(job_ptr) ||
-		    !acct_policy_job_runnable(job_ptr)) {
-			debug2("backfill: job %u is not allowed to run now. "
-			       "Skipping it. State=%s. Reason=%s. Priority=%u",
-			       job_ptr->job_id,
-			       job_state_string(job_ptr->job_state),
-			       job_reason_string(job_ptr->state_reason),
-			       job_ptr->priority);
-			continue;
-		}
-
 		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
 		    (part_ptr->node_bitmap == NULL))
 			continue;
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 51dc7ba718078c62cc08f60533d042ba086fdf4d..dd8f66d06333678b241044a5d4144edd75820fae 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1638,7 +1638,6 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version)
 			info("Holding job %u with invalid association", job_id);
 			xfree(job_ptr->state_desc);
 			job_ptr->state_reason = FAIL_ACCOUNT;
-			job_ptr->priority = 1;	/* Move to end of queue */
 		} else {
 			job_ptr->assoc_id = assoc_rec.id;
 			info("Recovered job %u %u", job_id, job_ptr->assoc_id);
@@ -1662,7 +1661,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version)
 	}
 
 	if (!job_finished && job_ptr->qos_id &&
-	    job_ptr->state_reason != FAIL_ACCOUNT) {
+	    (job_ptr->state_reason != FAIL_ACCOUNT)) {
 		memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t));
 		qos_rec.id = job_ptr->qos_id;
 		job_ptr->qos_ptr = _determine_and_validate_qos(
@@ -1671,7 +1670,6 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version)
 			info("Holding job %u with invalid qos", job_id);
 			xfree(job_ptr->state_desc);
 			job_ptr->state_reason = FAIL_QOS;
-			job_ptr->priority = 1;	/* Move to end of queue */
 		}
 		job_ptr->qos_id = qos_rec.id;
 	}
@@ -4176,7 +4174,6 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 			error_code = ESLURM_QOS_THRES;
 		else
 			error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
-		job_ptr->priority = 1;	/* Move to end of queue */
 		job_ptr->state_reason = fail_reason;
 		xfree(job_ptr->state_desc);
 	}
@@ -4994,14 +4991,6 @@ void job_time_limit(void)
 		 * running, suspended and pending job */
 		resv_status = job_resv_check(job_ptr);
 
-		if ((job_ptr->priority == 1) && (!IS_JOB_FINISHED(job_ptr))) {
-			/* Rather than resetting job priorities whenever a
-			 * DOWN, DRAINED or non-responsive node is returned to
-			 * service, we pick them up here. There will be a small
-			 * delay in restting a job's priority, but the code is
-			 * a lot cleaner this way. */
-			set_job_prio(job_ptr);
-		}
 		if (!IS_JOB_RUNNING(job_ptr))
 			continue;
 
@@ -6456,27 +6445,6 @@ extern void sync_job_priorities(void)
 
 	lowest_prio += prio_boost;
 }
-
-/* After a node is returned to service, reset the priority of jobs
- * which may have been held due to that node being unavailable */
-extern void reset_job_priority(void)
-{
-	ListIterator job_iterator;
-	struct job_record *job_ptr;
-	int count = 0;
-
-	job_iterator = list_iterator_create(job_list);
-	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
-		if ((job_ptr->priority == 1) && (!IS_JOB_FINISHED(job_ptr))) {
-			set_job_prio(job_ptr);
-			count++;
-		}
-	}
-	list_iterator_destroy(job_iterator);
-	if (count)
-		last_job_update = time(NULL);
-}
-
 /*
  * _top_priority - determine if any other job has a higher priority than the
  *	specified job
@@ -6497,7 +6465,7 @@ static bool _top_priority(struct job_record *job_ptr)
 	 * execute on different sets of nodes. While sched/backfill would
 	 * eventually start the job if delayed here based upon priority,
 	 * that could delay the initiation of a job by a few seconds. */
-	if(static_part == (uint16_t)NO_VAL) {
+	if (static_part == (uint16_t)NO_VAL) {
 		/* Since this never changes we can just set it once
 		   and not look at it again. */
 		rc = select_g_get_info_from_plugin(SELECT_STATIC_PART, job_ptr,
@@ -6527,6 +6495,8 @@ static bool _top_priority(struct job_record *job_ptr)
 				continue;
 			}
 			if (!acct_policy_job_runnable_state(job_ptr2) ||
+			    !misc_policy_job_runnable_state(job_ptr2) ||
+			    !part_policy_job_runnable_state(job_ptr2) ||
 			    !job_independent(job_ptr2, 0))
 				continue;
 			if ((job_ptr2->resv_name && (!job_ptr->resv_name)) ||
@@ -6570,7 +6540,7 @@ static bool _top_priority(struct job_record *job_ptr)
 				job_ptr->state_reason = WAIT_HELD;
 				xfree(job_ptr->state_desc);
 			}
-		} else if (job_ptr->priority != 1) {	/* not system hold */
+		} else if (job_ptr->state_reason == WAIT_NO_REASON) {
 			job_ptr->state_reason = WAIT_PRIORITY;
 			xfree(job_ptr->state_desc);
 		}
@@ -7864,7 +7834,6 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 			error_code = SLURM_SUCCESS;
 		else
 			error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
-		job_ptr->priority = 1;	/* Move to end of queue */
 		job_ptr->state_reason = fail_reason;
 		xfree(job_ptr->state_desc);
 		return error_code;
@@ -9563,7 +9532,6 @@ extern int job_hold_by_assoc_id(uint32_t assoc_id)
 		     job_ptr->job_id);
 		xfree(job_ptr->state_desc);
 		job_ptr->state_reason = FAIL_ACCOUNT;
-		job_ptr->priority = 1;	/* Move to end of queue */
 		cnt++;
 	}
 	list_iterator_destroy(job_iterator);
@@ -9615,7 +9583,6 @@ extern int job_hold_by_qos_id(uint32_t qos_id)
 		info("QOS deleted, holding job %u", job_ptr->job_id);
 		xfree(job_ptr->state_desc);
 		job_ptr->state_reason = FAIL_QOS;
-		job_ptr->priority = 1;	/* Move to end of queue */
 		cnt++;
 	}
 	list_iterator_destroy(job_iterator);
@@ -9770,7 +9737,7 @@ extern int send_jobs_to_accounting(void)
 	lock_slurmctld(job_write_lock);
 	itr = list_iterator_create(job_list);
 	while ((job_ptr = list_next(itr))) {
-		if(!job_ptr->assoc_id) {
+		if (!job_ptr->assoc_id) {
 			slurmdb_association_rec_t assoc_rec;
 			memset(&assoc_rec, 0,
 			       sizeof(slurmdb_association_rec_t));
@@ -9790,7 +9757,6 @@ extern int send_jobs_to_accounting(void)
 				     job_ptr->job_id);
 				xfree(job_ptr->state_desc);
 				job_ptr->state_reason = FAIL_ACCOUNT;
-				job_ptr->priority = 1;	/* Move to end of queue */
 				continue;
 			} else
 				job_ptr->assoc_id = assoc_rec.id;
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index fbebcdf56350ed145280a190f9dc134d063ac1f0..2b72878599a06ed7b2482229e25dc9077f28f7da 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -183,7 +183,7 @@ extern List build_job_queue(bool clear_start)
 			       job_reason_string(job_ptr->state_reason),
 			       job_ptr->priority);
 			continue;
-		} else if ((job_ptr->priority == 1) && !job_indepen &&
+		} else if (!job_indepen &&
 			   ((job_ptr->state_reason == WAIT_HELD) ||
 			    (job_ptr->state_reason == WAIT_HELD_USER))) {
 			/* released behind active dependency? */
@@ -200,6 +200,11 @@ extern List build_job_queue(bool clear_start)
 				fatal("list_iterator_create malloc failure");
 			while ((part_ptr = (struct part_record *)
 					list_next(part_iterator))) {
+				job_ptr->part_ptr = part_ptr;
+				if ((job_limits_check(&job_ptr) !=
+				     WAIT_NO_REASON) ||
+				    !acct_policy_job_runnable(job_ptr))
+					continue;
 				_job_queue_append(job_queue, job_ptr, part_ptr);
 			}
 			list_iterator_destroy(part_iterator);
@@ -217,6 +222,18 @@ extern List build_job_queue(bool clear_start)
 				      "part %s", job_ptr->job_id,
 				      job_ptr->partition);
 			}
+			if (!part_policy_job_runnable_state(job_ptr)) {
+				if (job_limits_check(&job_ptr) ==
+				    WAIT_NO_REASON) {
+					job_ptr->state_reason = WAIT_NO_REASON;
+					xfree(job_ptr->state_desc);
+				} else {
+					continue;
+				}
+			}
+			if (!acct_policy_job_runnable_state(job_ptr) ||
+			    !acct_policy_job_runnable(job_ptr))
+				continue;
 			_job_queue_append(job_queue, job_ptr,
 					  job_ptr->part_ptr);
 		}
@@ -429,6 +446,8 @@ extern int schedule(uint32_t job_limit)
 	while ((job_queue_rec = list_pop_bottom(job_queue, sort_job_queue2))) {
 		job_ptr = job_queue_rec->job_ptr;
 		part_ptr = job_queue_rec->part_ptr;
+		/* Cycle through partitions usable for this job */
+		job_ptr->part_ptr = part_ptr;
 		xfree(job_queue_rec);
 		if ((time(NULL) - sched_start) >= sched_timeout) {
 			debug("sched: loop taking too long, breaking out");
@@ -451,20 +470,7 @@ extern int schedule(uint32_t job_limit)
 			continue;
 		}
 
-		/* If a patition update has occurred, then do a limit check. */
-		if (save_last_part_update != last_part_update) {
-			int fail_reason = job_limits_check(&job_ptr);
-			if (fail_reason != WAIT_NO_REASON) {
-				job_ptr->state_reason = fail_reason;
-				job_ptr->priority = 1;
-				continue;
-			}
-		} else if ((job_ptr->state_reason == WAIT_PART_TIME_LIMIT) ||
-			   (job_ptr->state_reason == WAIT_PART_NODE_LIMIT)) {
-			job_ptr->start_time = 0;
-			job_ptr->priority = 1;
-			continue;
-		}
+		/* Test for valid account, QOS and required nodes on each pass */
 		if (job_ptr->state_reason == FAIL_ACCOUNT) {
 			slurmdb_association_rec_t assoc_rec;
 			memset(&assoc_rec, 0, sizeof(slurmdb_association_rec_t));
@@ -478,8 +484,9 @@ extern int schedule(uint32_t job_limit)
 					    &job_ptr->assoc_ptr)) {
 				job_ptr->state_reason = WAIT_NO_REASON;
 				job_ptr->assoc_id = assoc_rec.id;
-			} else
+			} else {
 				continue;
+			}
 		}
 		if (job_ptr->qos_id) {
 			slurmdb_association_rec_t *assoc_ptr;
@@ -491,18 +498,23 @@ extern int schedule(uint32_t job_limit)
 				     job_ptr->job_id);
 				xfree(job_ptr->state_desc);
 				job_ptr->state_reason = FAIL_QOS;
-				job_ptr->priority = 1;	/* Move to end of queue */
 				continue;
+			} else if (job_ptr->state_reason == FAIL_QOS) {
+				xfree(job_ptr->state_desc);
+				job_ptr->state_reason = WAIT_NO_REASON;
 			}
 		}
-		if (job_ptr->part_ptr != part_ptr) {
-			/* Cycle through partitions usable for this job */
-			job_ptr->part_ptr = part_ptr;
+		if ((job_ptr->state_reason == WAIT_NODE_NOT_AVAIL) &&
+		    job_ptr->details && job_ptr->details->req_node_bitmap &&
+		    !bit_super_set(job_ptr->details->req_node_bitmap,
+				   avail_node_bitmap)) {
+			continue;
 		}
+
 		if ((job_ptr->resv_name == NULL) &&
 		    _failed_partition(job_ptr->part_ptr, failed_parts,
 				      failed_part_cnt)) {
-			if (job_ptr->priority != 1) {	/* not system hold */
+			if (job_ptr->state_reason == WAIT_NO_REASON) {
 				job_ptr->state_reason = WAIT_PRIORITY;
 				xfree(job_ptr->state_desc);
 			}
@@ -553,7 +565,6 @@ extern int schedule(uint32_t job_limit)
 			last_job_update = time(NULL);
 			job_ptr->state_reason = FAIL_ACCOUNT;
 			xfree(job_ptr->state_desc);
-			job_ptr->priority = 1;	/* Move to end of queue */
 			continue;
 		}
 
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 75fe6b12a3cd2d20b9c85cf1cdbb5a4f67c700a6..2c0ea27a065cbd22c3865ce7e661b18f9c0f0fe9 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1092,7 +1092,6 @@ int update_node ( update_node_msg_t * update_node_msg )
 				bit_set (idle_node_bitmap, node_inx);
 				bit_set (up_node_bitmap, node_inx);
 				node_ptr->last_idle = now;
-				reset_job_priority();
 			} else if (state_val == NODE_STATE_ALLOCATED) {
 				if (!IS_NODE_DRAIN(node_ptr) &&
 				    !IS_NODE_FAIL(node_ptr) &&
@@ -1794,7 +1793,6 @@ extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg)
 	reg_msg->os = NULL;	/* Nothing left to free */
 
 	if (IS_NODE_NO_RESPOND(node_ptr)) {
-		reset_job_priority();
 		node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
 		node_ptr->node_state &= (~NODE_STATE_POWER_UP);
 		last_node_update = time (NULL);
@@ -1816,7 +1814,6 @@ extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg)
 		}
 	} else {
 		if (IS_NODE_UNKNOWN(node_ptr) || IS_NODE_FUTURE(node_ptr)) {
-			reset_job_priority();
 			debug("validate_node_specs: node %s registered with "
 			      "%u jobs",
 			      reg_msg->node_name,reg_msg->job_count);
@@ -1857,7 +1854,6 @@ extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg)
 			}
 			info("node %s returned to service",
 			     reg_msg->node_name);
-			reset_job_priority();
 			trigger_node_up(node_ptr);
 			last_node_update = now;
 			if (!IS_NODE_DRAIN(node_ptr)
@@ -2244,10 +2240,8 @@ extern int validate_nodes_via_front_end(
 		hostlist_destroy(reg_hostlist);
 	}
 
-	if (update_node_state) {
-		reset_job_priority();
+	if (update_node_state)
 		last_node_update = time (NULL);
-	}
 
 	return error_code;
 }
@@ -2319,7 +2313,6 @@ static void _node_did_resp(struct node_record *node_ptr)
 	node_ptr->last_response = now;
 	if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_POWER_UP(node_ptr)) {
 		info("Node %s now responding", node_ptr->name);
-		reset_job_priority();
 		node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
 		node_ptr->node_state &= (~NODE_STATE_POWER_UP);
 		if (!is_node_in_maint_reservation(node_inx))
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 602c86525a3ea7a8a38b6f1b2e6df69ebcdb1cb3..82d19145ad354f68237cb20ab5847cfdb69e13bb 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1247,7 +1247,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 			return ESLURM_JOB_HELD;
 		}
 		job_ptr->state_reason = fail_reason;
-		job_ptr->priority = 1;	/* sys hold, move to end of queue */
 		return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
 	}
 
@@ -1340,8 +1339,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 		     job_ptr->job_id);
 		job_ptr->state_reason = WAIT_PART_NODE_LIMIT;
 		xfree(job_ptr->state_desc);
-		if (job_ptr->priority != 0)	/* Move to end of queue */
-			job_ptr->priority = 1;
 		last_job_update = now;
 	} else if (error_code == ESLURM_NODE_NOT_AVAIL) {
 		/* Required nodes are down or drained */
@@ -1349,8 +1346,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 		     job_ptr->job_id);
 		job_ptr->state_reason = WAIT_NODE_NOT_AVAIL;
 		xfree(job_ptr->state_desc);
-		if (job_ptr->priority != 0)	/* Move to end of queue */
-			job_ptr->priority = 1;
 		last_job_update = now;
 	} else if (error_code == ESLURM_RESERVATION_NOT_USABLE) {
 		job_ptr->state_reason = WAIT_RESERVATION;
diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c
index a5829f9cf92e48f391e5864e92733f63e0fb7b04..a37b399b648860f10b7b4ca027e127e9530a03f9 100644
--- a/src/slurmctld/partition_mgr.c
+++ b/src/slurmctld/partition_mgr.c
@@ -1375,7 +1375,6 @@ extern int update_part (update_part_msg_t * part_desc, bool create_flag)
 	if (error_code == SLURM_SUCCESS) {
 		slurm_sched_partition_change();	/* notify sched plugin */
 		select_g_reconfigure();		/* notify select plugin too */
-		reset_job_priority();		/* free jobs */
 	}
 	return error_code;
 }
@@ -1577,7 +1576,40 @@ extern int delete_partition(delete_part_msg_t *part_desc_ptr)
 
 	slurm_sched_partition_change();	/* notify sched plugin */
 	select_g_reconfigure();		/* notify select plugin too */
-	reset_job_priority();		/* free jobs */
 
 	return SLURM_SUCCESS;
 }
+
+/*
+ * Determine if the specified job can execute right now or is currently
+ * blocked by a miscellaneous limit. This does not re-validate job state,
+ * but relies upon schedule() in src/slurmctld/job_scheduler.c to do so.
+ */
+extern bool misc_policy_job_runnable_state(struct job_record *job_ptr)
+{
+	if ((job_ptr->state_reason == FAIL_ACCOUNT) ||
+	    (job_ptr->state_reason == FAIL_QOS) ||
+	    (job_ptr->state_reason == WAIT_NODE_NOT_AVAIL)) {
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Determine if the specified job can execute right now or is currently
+ * blocked by a partition state or limit. Execute job_limits_check() to
+ * re-validate job state.
+ */
+extern bool part_policy_job_runnable_state(struct job_record *job_ptr)
+{
+	if ((job_ptr->state_reason == WAIT_PART_DOWN) ||
+	    (job_ptr->state_reason == WAIT_PART_INACTIVE) ||
+	    (job_ptr->state_reason == WAIT_PART_NODE_LIMIT) ||
+	    (job_ptr->state_reason == WAIT_PART_TIME_LIMIT) ||
+	    (job_ptr->state_reason == WAIT_QOS_THRES)) {
+		return false;
+	}
+
+	return true;
+}
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 839c3f6b826126da7765650780fc4469350e889d..6305143ee9fca8509c98f5bf14956e4870dbd361 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -1316,6 +1316,13 @@ extern void make_node_comp(struct node_record *node_ptr,
 extern void make_node_idle(struct node_record *node_ptr,
 			   struct job_record *job_ptr);
 
+/*
+ * Determine if the specified job can execute right now or is currently
+ * blocked by a miscellaneous limit. This does not re-validate job state,
+ * but relies upon schedule() in src/slurmctld/job_scheduler.c to do so.
+ */
+extern bool misc_policy_job_runnable_state(struct job_record *job_ptr);
+
 /* msg_to_slurmd - send given msg_type every slurmd, no args */
 extern void msg_to_slurmd (slurm_msg_type_t msg_type);
 
@@ -1456,6 +1463,13 @@ extern void part_filter_set(uid_t uid);
 /* part_fini - free all memory associated with partition records */
 extern void part_fini (void);
 
+/*
+ * Determine if the specified job can execute right now or is currently
+ * blocked by a partition state or limit. Execute job_limits_check() to
+ * re-validate job state.
+ */
+extern bool part_policy_job_runnable_state(struct job_record *job_ptr);
+
 /*
  * partition_in_use - determine whether a partition is in use by a RUNNING
  *	PENDING or SUSPENDED job
@@ -1499,10 +1513,6 @@ extern void reset_first_job_id(void);
  */
 extern void reset_job_bitmaps (void);
 
-/* After a node is returned to service, reset the priority of jobs
- * which may have been held due to that node being unavailable */
-extern void reset_job_priority(void);
-
 /*
  * restore_node_features - Make node and config (from slurm.conf) fields
  *	consistent for Features, Gres and Weight