From 0fc4539e7a0e23f1eaab5c692ce33418c9f7768e Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Thu, 4 Apr 2013 12:24:13 -0700
Subject: [PATCH] Prevent assert failure in scheduling logic

This fixes a bug introduced in commit f1cf6d2df1abbcf2703e5ba81bc4792487a691d8.
Without this change, a job with a partition-related reason for not running
(e.g. MaxNodes) plus some other reason (e.g. dependency) could, after
satisfying the second reason (e.g. dependency), have the select_nodes()
function executed and return SUCCESS on an allocation request, but it would
not actually be allocating resources since select_nodes() would interpret
the request as a test to see if it could ever run (e.g. will_run).
Bug 263
---
 src/slurmctld/job_mgr.c        | 12 ++++--------
 src/slurmctld/job_scheduler.c  | 15 ++++++++++-----
 src/slurmctld/node_scheduler.c | 14 +-------------
 3 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index c79055a2077..b9f21108199 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -2727,6 +2727,8 @@ static int _select_nodes_parts(struct job_record *job_ptr, bool test_only,
 		}
 		list_iterator_destroy(iter);
 	} else {
+		if (job_limits_check(&job_ptr) != WAIT_NO_REASON)
+			test_only = true;
 		rc = select_nodes(job_ptr, test_only, select_node_bitmap);
 	}
 
@@ -2851,14 +2853,8 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 	test_only = will_run || (allocate == 0);
 
 	no_alloc = test_only || too_fragmented ||
-		   (!top_prio) || (!independent);
-	if (!no_alloc && !avail_front_end()) {
-		debug("sched: job_allocate() returning, no front end nodes "
-		      "are available");
-		error_code = ESLURM_NODES_BUSY;
-	} else {
-		error_code = _select_nodes_parts(job_ptr, no_alloc, NULL);
-	}
+		   (!top_prio) || (!independent) || !avail_front_end();
+	error_code = _select_nodes_parts(job_ptr, no_alloc, NULL);
 	if (!test_only) {
 		last_job_update = now;
 		slurm_sched_schedule();	/* work for external scheduler */
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index cf9c95d2cf4..4052a61f9de 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -200,13 +200,16 @@ static bool _job_runnable_test1(struct job_record *job_ptr, bool clear_start)
 /* Job and partition tests for ability to run now */
 static bool _job_runnable_test2(struct job_record *job_ptr)
 {
-	if (!part_policy_job_runnable_state(job_ptr)) {
-		job_ptr->state_reason = job_limits_check(&job_ptr);
-		xfree(job_ptr->state_desc);
+	int reason;
 
-		if (job_ptr->state_reason != WAIT_NO_REASON)
-			return false;
+	reason = job_limits_check(&job_ptr);
+	if ((reason != job_ptr->state_reason) &&
+	    (part_policy_job_runnable_state(job_ptr))) {
+		job_ptr->state_reason = reason;
+		xfree(job_ptr->state_desc);
 	}
+	if (reason != WAIT_NO_REASON)
+		return false;
 
 	return true;
 }
@@ -498,6 +501,8 @@ next_part:		part_ptr = (struct part_record *)
 				continue;
 			}
 		}
+		if (job_limits_check(&job_ptr) != WAIT_NO_REASON)
+			continue;
 
 		/* Test for valid account, QOS and required nodes on each pass */
 		if (job_ptr->state_reason == FAIL_ACCOUNT) {
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index f7135bdcdb1..dec0a15982c 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1426,7 +1426,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	struct node_set *node_set_ptr = NULL;
 	struct part_record *part_ptr = NULL;
 	uint32_t min_nodes, max_nodes, req_nodes;
-	enum job_state_reason fail_reason;
 	time_t now = time(NULL);
 	bool configuring = false;
 	List preemptee_job_list = NULL;
@@ -1458,13 +1457,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 		return ESLURM_JOB_HELD;
 	}
 
-	/* Confirm that partition is up and has compatible nodes limits.
-	 * If not, we still continue to determine if the job can ever run.
-	 * For example, the geometry might be incompatible with BlueGene. */
-	fail_reason = job_limits_check(&job_ptr);
-	if (fail_reason != WAIT_NO_REASON)
-		test_only = true;
-
 	/* build sets of usable nodes based upon their configuration */
 	error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size);
 	if (error_code)
@@ -1552,7 +1544,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 			}
 		}
 	}
-	if (error_code || fail_reason) {
+	if (error_code) {
 		/* Fatal errors for job here */
 		if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
 			/* Too many nodes requested */
@@ -1561,10 +1553,6 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 			job_ptr->state_reason = WAIT_PART_NODE_LIMIT;
 			xfree(job_ptr->state_desc);
 			last_job_update = now;
-		} else if (fail_reason) {
-			job_ptr->state_reason = fail_reason;
-			xfree(job_ptr->state_desc);
-			last_job_update = now;
 
 		/* Non-fatal errors for job below */
 		} else if (error_code == ESLURM_NODE_NOT_AVAIL) {
--
GitLab
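
Below the patch itself, a minimal standalone C sketch of the control flow this
change moves into the callers: the caller now asks job_limits_check() itself
and, on any result other than WAIT_NO_REASON, downgrades the select_nodes()
call to a test-only (will_run style) request, so a success return can no longer
be mistaken for a real allocation. The types and helper bodies here are
simplified stand-ins for the slurmctld structures, not the actual SLURM code.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for slurmctld types; the real code uses
 * struct job_record, enum job_state_reason, etc. */
enum wait_reason { WAIT_NO_REASON = 0, WAIT_PART_NODE_LIMIT };

struct job_record {
	int      job_id;
	unsigned max_nodes;      /* nodes requested by the job */
	unsigned part_max_nodes; /* partition MaxNodes limit   */
};

/* Stand-in for job_limits_check(): report a wait reason if the job
 * violates its partition limits, otherwise WAIT_NO_REASON. */
static enum wait_reason job_limits_check(struct job_record *job_ptr)
{
	if (job_ptr->max_nodes > job_ptr->part_max_nodes)
		return WAIT_PART_NODE_LIMIT;
	return WAIT_NO_REASON;
}

/* Stand-in for select_nodes(): with test_only set it only reports
 * whether the job could ever run, allocates nothing, and still
 * returns success. */
static int select_nodes(struct job_record *job_ptr, bool test_only)
{
	if (test_only) {
		printf("job %d: will-run test only, nothing allocated\n",
		       job_ptr->job_id);
		return 0;
	}
	printf("job %d: resources allocated\n", job_ptr->job_id);
	return 0;
}

/* Caller-side gating in the spirit of _select_nodes_parts() after this
 * patch: force test_only before calling select_nodes() instead of
 * letting select_nodes() silently reinterpret the request. */
static int select_nodes_gated(struct job_record *job_ptr, bool test_only)
{
	if (job_limits_check(job_ptr) != WAIT_NO_REASON)
		test_only = true;
	return select_nodes(job_ptr, test_only);
}

int main(void)
{
	struct job_record job = { 1234, 16, 8 };

	/* The allocation request is downgraded to a test because the job
	 * exceeds the partition's MaxNodes, so the success return cannot
	 * be misread as an actual allocation. */
	select_nodes_gated(&job, false);
	return 0;
}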