diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c
index 9222ab27b2d68ffc6cfc641dada8ee219950e4c1..ff800f5b81b683ac8a569d432cefefc1453eab5b 100644
--- a/src/salloc/salloc.c
+++ b/src/salloc/salloc.c
@@ -357,7 +357,7 @@ int main(int argc, char *argv[])
 	}
 #else
 	if (!_wait_nodes_ready(alloc)) {
-		if(!allocation_interrupted)
+		if (!allocation_interrupted)
 			error("Something is wrong with the "
 			      "boot of the nodes.");
 		goto relinquish;
@@ -1076,6 +1076,8 @@ static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
 
 	pending_job_id = alloc->job_id;
 
+	if (alloc->alias_list && !strcmp(alloc->alias_list, "TBD"))
+		opt.wait_all_nodes = 1;	/* Wait for boot & addresses */
 	if (opt.wait_all_nodes == (uint16_t) NO_VAL)
 		opt.wait_all_nodes = DEFAULT_WAIT_ALL_NODES;
 
@@ -1110,8 +1112,18 @@ static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
 			break;
 		}
 		if (is_ready) {
+			resource_allocation_response_msg_t *resp;
+			char *tmp_str;
 			if (i > 0)
-				info ("Nodes %s are ready for job", alloc->node_list);
+				info("Nodes %s are ready for job", alloc->node_list);
+			if (alloc->alias_list && !strcmp(alloc->alias_list, "TBD") &&
+			    (slurm_allocation_lookup_lite(pending_job_id, &resp)
+			     == SLURM_SUCCESS)) {
+				tmp_str = alloc->alias_list;
+				alloc->alias_list = resp->alias_list;
+				resp->alias_list = tmp_str;
+				slurm_free_resource_allocation_response_msg(resp);
+			}
 		} else if (!allocation_interrupted)
 			error("Nodes %s are still not ready", alloc->node_list);
 		else	/* allocation_interrupted or slurmctld not responing */
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 48f517f520df7c592a4849a6c21701ab114fc40f..e13c64ef3d54b54d37f14c52c1f3404312b48042 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -1522,7 +1522,7 @@ static int _batch_launch_defer(queued_request_t *queued_req_ptr)
 	batch_job_launch_msg_t *launch_msg_ptr;
 	time_t now = time(NULL);
 	struct job_record *job_ptr;
-	int delay_time, nodes_ready = 0;
+	int delay_time, nodes_ready = 0, tmp;
 
 	agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
 	if (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH)
@@ -1544,7 +1544,19 @@
 	}
 
 	if (job_ptr->wait_all_nodes) {
-		(void) job_node_ready(launch_msg_ptr->job_id, &nodes_ready);
+		(void) job_node_ready(launch_msg_ptr->job_id, &tmp);
+		if (tmp == (READY_JOB_STATE | READY_NODE_STATE)) {
+			nodes_ready = 1;
+			if (launch_msg_ptr->alias_list &&
+			    !strcmp(launch_msg_ptr->alias_list, "TBD")) {
+				struct job_record *job_ptr;
+				job_ptr = find_job_record(launch_msg_ptr->
+							  job_id);
+				xfree(launch_msg_ptr->alias_list);
+				launch_msg_ptr->alias_list = xstrdup(job_ptr->
+								     alias_list);
+			}
+		}
 	} else {
 #ifdef HAVE_FRONT_END
 		nodes_ready = 1;
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 4f080efa12c75bb5dfb324fb206e27632daa1a13..51dc7ba718078c62cc08f60533d042ba086fdf4d 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -8530,6 +8530,12 @@ job_alloc_info(uint32_t uid, uint32_t job_id, struct job_record **job_pptr)
 	if (IS_JOB_FINISHED(job_ptr))
 		return ESLURM_ALREADY_DONE;
 
+	if (job_ptr->alias_list && !strcmp(job_ptr->alias_list, "TBD") &&
+	    job_ptr->node_bitmap &&
+	    (bit_overlap(power_node_bitmap, job_ptr->node_bitmap) == 0)) {
+		set_job_alias_list(job_ptr);
+	}
+
 	*job_pptr = job_ptr;
 	return SLURM_SUCCESS;
 }
@@ -8992,18 +8998,23 @@ extern int job_node_ready(uint32_t job_id, int *ready)
 	/* Always call select_g_job_ready() so that select/bluegene can
 	 * test and update block state information.
 	 */
 	rc = select_g_job_ready(job_ptr);
-
 	if (rc == READY_JOB_FATAL)
 		return ESLURM_INVALID_PARTITION_NAME;
 	if (rc == READY_JOB_ERROR)
 		return EAGAIN;
-
 	if (rc)
 		rc = READY_NODE_STATE;
 	if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
 		rc |= READY_JOB_STATE;
 
+	if ((rc == (READY_NODE_STATE | READY_JOB_STATE)) &&
+	    job_ptr->alias_list && !strcmp(job_ptr->alias_list, "TBD") &&
+	    job_ptr->node_bitmap &&
+	    (bit_overlap(power_node_bitmap, job_ptr->node_bitmap) == 0)) {
+		set_job_alias_list(job_ptr);
+	}
+
 	*ready = rc;
 	return SLURM_SUCCESS;
 }
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 464ff0375d12b2926054bc7d7b26e16f9ab2d687..a9517f2f1d4220988b25c0d8ffd2785b2ad55dfe 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -114,6 +114,7 @@ static int _pick_best_nodes(struct node_set *node_set_ptr,
 			    uint32_t req_nodes, bool test_only,
 			    List preemptee_candidates,
 			    List *preemptee_job_list);
+static void _set_alias_list(struct job_record *job_ptr);
 static bool _valid_feature_counts(struct job_details *detail_ptr,
 				  bitstr_t *node_bitmap, bool *has_xor);
 static bitstr_t *_valid_features(struct job_details *detail_ptr,
@@ -130,6 +131,7 @@ extern void allocate_nodes(struct job_record *job_ptr)
 {
 	int i;
 	struct node_record *node_ptr;
+	bool has_cloud = false, has_cloud_power_save = false;
 
 #ifdef HAVE_FRONT_END
 	job_ptr->front_end_ptr = assign_front_end();
@@ -138,12 +140,52 @@ extern void allocate_nodes(struct job_record *job_ptr)
 	job_ptr->batch_host = xstrdup(job_ptr->front_end_ptr->name);
 #endif
 
+	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
+	     i++, node_ptr++) {
+		if (!bit_test(job_ptr->node_bitmap, i))
+			continue;
+
+		if (IS_NODE_CLOUD(node_ptr)) {
+			has_cloud = true;
+			if (IS_NODE_POWER_SAVE(node_ptr))
+				has_cloud_power_save = true;
+		}
+		make_node_alloc(&node_record_table_ptr[i], job_ptr);
+		if (job_ptr->batch_host == NULL)
+			job_ptr->batch_host = xstrdup(node_ptr->name);
+	}
+	last_node_update = time(NULL);
+	license_job_get(job_ptr);
+
+	if (has_cloud) {
+		if (has_cloud_power_save) {
+			job_ptr->alias_list = xstrdup("TBD");
+			job_ptr->wait_all_nodes = 1;
+		} else
+			set_job_alias_list(job_ptr);
+	}
+
+	return;
+}
+
+/* Set a job's alias_list string */
+extern void set_job_alias_list(struct job_record *job_ptr)
+{
+	int i;
+	struct node_record *node_ptr;
 	xfree(job_ptr->alias_list);
 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
 	     i++, node_ptr++) {
 		if (!bit_test(job_ptr->node_bitmap, i))
 			continue;
 
+		if (IS_NODE_CLOUD(node_ptr)) {
+			if (IS_NODE_POWER_SAVE(node_ptr)) {
+				xfree(job_ptr->alias_list);
+				job_ptr->alias_list = xstrdup("TBD");
+				break;
+			}
+		}
 		if (job_ptr->alias_list)
 			xstrcat(job_ptr->alias_list, ",");
 		xstrcat(job_ptr->alias_list, node_ptr->name);
@@ -152,17 +194,9 @@ extern void allocate_nodes(struct job_record *job_ptr)
 		xstrcat(job_ptr->alias_list, ":");
 		xstrcat(job_ptr->alias_list, node_ptr->node_hostname);
-		make_node_alloc(&node_record_table_ptr[i], job_ptr);
-		if (job_ptr->batch_host)
-			continue;
-		job_ptr->batch_host = xstrdup(node_ptr->name);
 	}
-	last_node_update = time(NULL);
-	license_job_get(job_ptr);
-	return;
 }
-
 /*
  * deallocate_nodes - for a given job, deallocate its nodes and make
  *	their state NODE_STATE_COMPLETING also release the job's licenses
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index e1d707db75b6426cbe819db7490a5e4fc03bfba6..839c3f6b826126da7765650780fc4469350e889d 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -1538,6 +1538,9 @@ extern int send_jobs_to_accounting();
  */
 extern int send_nodes_to_accounting(time_t event_time);
 
+/* Set a job's alias_list string */
+extern void set_job_alias_list(struct job_record *job_ptr);
+
 /*
  * set_job_prio - set a default job priority
  * IN job_ptr - pointer to the job_record
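
Note, not part of the patch: the hunks above implement a small handshake for powered-down cloud nodes. allocate_nodes() stores the placeholder alias_list "TBD" and forces wait_all_nodes; once job_node_ready() reports READY_JOB_STATE | READY_NODE_STATE and no allocated node is still in power-save, set_job_alias_list() fills in the real list, and salloc swaps it into its allocation response. The sketch below restates that client-side swap as a standalone helper. It is a minimal illustration only, assuming the same <slurm/slurm.h> API the patch itself uses (slurm_allocation_lookup_lite(), slurm_free_resource_allocation_response_msg()); the helper name _refresh_alias_list is made up for the example and is not defined anywhere in the patch.

/* Illustrative sketch only (not part of the patch): refresh a "TBD"
 * alias_list by re-reading the allocation from slurmctld, mirroring the
 * swap done in salloc's _wait_nodes_ready() above. */
#include <string.h>
#include <slurm/slurm.h>

static void _refresh_alias_list(resource_allocation_response_msg_t *alloc)
{
	resource_allocation_response_msg_t *resp;
	char *tmp_str;

	/* Nothing to do unless the controller left the placeholder */
	if (!alloc->alias_list || strcmp(alloc->alias_list, "TBD"))
		return;

	if (slurm_allocation_lookup_lite(alloc->job_id, &resp) != SLURM_SUCCESS)
		return;		/* Keep "TBD"; the caller may retry later */

	/* Swap in the now-populated alias list, then free the lookup reply */
	tmp_str = alloc->alias_list;
	alloc->alias_list = resp->alias_list;
	resp->alias_list = tmp_str;
	slurm_free_resource_allocation_response_msg(resp);
}

In the patch, salloc performs the same swap in place on its existing allocation message inside _wait_nodes_ready() rather than through a separate helper.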