diff --git a/NEWS b/NEWS
index d2b0a8693c7cd436892504e188a472eb07973032..01d8921db90d0243f6a1284d6582c0677346d352 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,8 @@ documents those changes that are of interest to users and admins.
  -- Fix bug in QoS limits enforcement when slurmctld restarts and user not
     yet added to the QOS list.
  -- Fix issue where sjstat and sjobexitmod was installed in 2 different rpms
+ -- Check all requested partitions to make sure at least one is valid on job
+    submission.
 
 * Changes in SLURM 2.5.1
 ========================
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index d906b9831185b6fcbdfc6c72bb46005f88fb9e27..f0fdc9cd5423d5fcdcffbe2be4daf4fed21c818f 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1421,13 +1421,14 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	int error_code = SLURM_SUCCESS, i, node_set_size = 0;
 	bitstr_t *select_bitmap = NULL;
 	struct node_set *node_set_ptr = NULL;
-	struct part_record *part_ptr = NULL;
+	struct part_record *part_ptr = NULL, *orig_part_ptr = NULL;
 	uint32_t min_nodes, max_nodes, req_nodes;
 	enum job_state_reason fail_reason;
 	time_t now = time(NULL);
 	bool configuring = false;
 	List preemptee_job_list = NULL;
 	slurmdb_qos_rec_t *qos_ptr = NULL;
+	int attempt=1;
 
 	xassert(job_ptr);
 	xassert(job_ptr->magic == JOB_MAGIC);
@@ -1435,12 +1436,12 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	if (!acct_policy_job_runnable(job_ptr))
 		return ESLURM_ACCOUNTING_POLICY;
 
-	part_ptr = job_ptr->part_ptr;
+	orig_part_ptr = part_ptr = job_ptr->part_ptr;
 	qos_ptr = (slurmdb_qos_rec_t *)job_ptr->qos_ptr;
 
 	/* identify partition */
 	if (part_ptr == NULL) {
-		part_ptr = find_part_record(job_ptr->partition);
+		orig_part_ptr = part_ptr = find_part_record(job_ptr->partition);
 		xassert(part_ptr);
 		job_ptr->part_ptr = part_ptr;
 		error("partition pointer reset for job %u, part %s",
@@ -1467,7 +1468,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	/* build sets of usable nodes based upon their configuration */
 	error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size);
 	if (error_code)
-		return error_code;
+		goto cleanup;
 
 	/* insure that selected nodes are in these node sets */
 	if (job_ptr->details->req_node_bitmap) {
@@ -1665,6 +1666,28 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 		slurm_sched_newalloc(job_ptr);
 
       cleanup:
+
+	if (error_code && job_ptr->part_ptr_list
+	    && (attempt < list_count(job_ptr->part_ptr_list))) {
+		ListIterator part_iterator = list_iterator_create(
+			job_ptr->part_ptr_list);
+		int part_cnt = 0;
+		while ((part_ptr = list_next(part_iterator))) {
+			if (part_cnt == attempt)
+				break;
+			part_cnt++;
+		}
+		list_iterator_destroy(part_iterator);
+		if (part_ptr) {
+			attempt++;
+			job_ptr->part_ptr = part_ptr;
+			debug2("Trying job %u on next requested partition %s",
+			       job_ptr->job_id, part_ptr->name);
+			goto retry;
+		}
+		job_ptr->part_ptr = orig_part_ptr;
+	}
+
 	if (preemptee_job_list)
 		list_destroy(preemptee_job_list);
 	if (select_node_bitmap)
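
Note on the change above: when node selection fails for the current partition, control falls through to the cleanup: label, the next entry in job_ptr->part_ptr_list is selected, and selection is retried; the originally requested partition (orig_part_ptr) is restored only if every requested partition fails. The standalone C program below is a minimal sketch of that retry-over-partitions pattern, not SLURM code; partition_t, fake_select(), and the sample data are hypothetical stand-ins for illustration only.

/* sketch.c - illustrate "try each requested partition until one works" */
#include <stdio.h>
#include <stdbool.h>

typedef struct {
	const char *name;
	int free_nodes;
} partition_t;          /* hypothetical stand-in for struct part_record */

/* Hypothetical stand-in for the per-partition node selection attempt. */
static bool fake_select(const partition_t *part, int nodes_wanted)
{
	return part->free_nodes >= nodes_wanted;
}

int main(void)
{
	/* Job requested several partitions; only one has to be usable. */
	partition_t parts[] = {
		{ "debug",  0 },	/* too small */
		{ "batch",  4 },	/* enough nodes */
		{ "long",  16 },
	};
	const int nparts = (int)(sizeof(parts) / sizeof(parts[0]));
	const int nodes_wanted = 2;
	const partition_t *chosen = NULL;

	/* Mirror of the patch: on failure, advance to the next requested
	 * partition and retry instead of rejecting the job outright. */
	for (int attempt = 0; attempt < nparts; attempt++) {
		if (fake_select(&parts[attempt], nodes_wanted)) {
			chosen = &parts[attempt];
			break;
		}
		printf("partition %s unusable, trying next\n",
		       parts[attempt].name);
	}

	if (chosen)
		printf("job placed in partition %s\n", chosen->name);
	else
		printf("no requested partition is valid\n");
	return 0;
}

The sketch collapses the patch's goto-based retry into a loop for readability; the effect is the same, since the attempt counter in the patch serves only to pick the next list entry on each pass through cleanup:.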