diff --git a/doc/html/elastic_computing.shtml b/doc/html/elastic_computing.shtml index 6e90de7ed4747cf4ff260c99e58c58b603949a41..2fa6789fce0a4179673704f2ea277b48ea83485e 100644 --- a/doc/html/elastic_computing.shtml +++ b/doc/html/elastic_computing.shtml @@ -126,10 +126,18 @@ option is used.</li> <p>The <i>SuspendProgram</i> only needs to relinquish the node back to the cloud.</p> +<p>An environment variable SLURM_NODE_ALIASES is set with the node name and +communication address pairs. The variable is set by salloc, sbatch, and srun. +It is then used by srun to determine the destination for job launch +communication messages.</p> + <h2>Remaining Work</h2> <ul> -<li>srun commands are not currently aware of the addresses of nodes in the cloud.</li> +<li>The sbatch logic needs modification to set the SLURM_NODE_ALIASES +environment variable.</li> +<li>The SLURM_NODE_ALIASES environment variable needs to change if the job's +size changes.</li> <li>We need scripts to provision resources from EC2.</li> </ul> diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 5ca6e5d4c57c7162ec03e6b6416a4edc9a35cf0b..73f95817a997d49bda3f4e41cb9f1904612727e2 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1501,6 +1501,7 @@ typedef struct delete_partition_msg { typedef struct resource_allocation_response_msg { uint32_t job_id; /* assigned job id */ char *node_list; /* assigned list of nodes */ + char *alias_list; /* node name/address aliases */ uint32_t num_cpu_groups;/* elements in below cpu arrays */ uint16_t *cpus_per_node;/* cpus per node */ uint32_t *cpu_count_reps;/* how many nodes have same cpu count */ diff --git a/src/common/env.c b/src/common/env.c index 55872f98542cdfca6d732850e68b9f0e5f69e134..51f56a17e0858f489077740ed5be21f25ca1b3e5 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -943,17 +943,19 @@ env_array_for_job(char ***dest, const resource_allocation_response_msg_t *alloc, env_array_overwrite_fmt(dest, "SLURM_JOB_NUM_NODES", "%u", node_cnt); env_array_overwrite_fmt(dest, "SLURM_JOB_NODELIST", "%s", alloc->node_list); + env_array_overwrite_fmt(dest, "SLURM_NODE_ALIASES", "%s", + alloc->alias_list); _set_distribution(desc->task_dist, &dist, &lllp_dist); - if(dist) + if (dist) env_array_overwrite_fmt(dest, "SLURM_DISTRIBUTION", "%s", dist); - if(desc->task_dist == SLURM_DIST_PLANE) + if (desc->task_dist == SLURM_DIST_PLANE) env_array_overwrite_fmt(dest, "SLURM_DIST_PLANESIZE", "%u", desc->plane_size); - if(lllp_dist) + if (lllp_dist) env_array_overwrite_fmt(dest, "SLURM_DIST_LLLP", "%s", lllp_dist); diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 84294549ca05d2ffee31e6200532affcc4828184..9304a531e6429f405239768afff058b555507220 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1794,6 +1794,7 @@ extern void slurm_free_resource_allocation_response_msg ( if (msg) { select_g_select_jobinfo_free(msg->select_jobinfo); msg->select_jobinfo = NULL; + xfree(msg->alias_list); xfree(msg->node_list); xfree(msg->cpus_per_node); xfree(msg->cpu_count_reps); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 872e7c82c3b8e70f04d95b4cecb019e9a25fb22f..25dea356911df687dbd7e6fbad07f371bbf6ef3c 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2465,6 +2465,7 @@ _pack_resource_allocation_response_msg(resource_allocation_response_msg_t *msg, pack32(msg->error_code, buffer); pack32(msg->job_id, buffer); pack32(msg->pn_min_memory, buffer); +
packstr(msg->alias_list, buffer); packstr(msg->node_list, buffer); pack32(msg->num_cpu_groups, buffer); @@ -2493,31 +2494,61 @@ _unpack_resource_allocation_response_msg( *msg = tmp_ptr; /* load the data values */ - safe_unpack32(&tmp_ptr->error_code, buffer); - safe_unpack32(&tmp_ptr->job_id, buffer); - safe_unpack32(&tmp_ptr->pn_min_memory, buffer); - safe_unpackstr_xmalloc(&tmp_ptr->node_list, &uint32_tmp, buffer); + if (protocol_version >= SLURM_2_4_PROTOCOL_VERSION) { + safe_unpack32(&tmp_ptr->error_code, buffer); + safe_unpack32(&tmp_ptr->job_id, buffer); + safe_unpack32(&tmp_ptr->pn_min_memory, buffer); + safe_unpackstr_xmalloc(&tmp_ptr->alias_list, &uint32_tmp, + buffer); + safe_unpackstr_xmalloc(&tmp_ptr->node_list, &uint32_tmp, + buffer); - safe_unpack32(&tmp_ptr->num_cpu_groups, buffer); - if (tmp_ptr->num_cpu_groups > 0) { - safe_unpack16_array(&tmp_ptr->cpus_per_node, &uint32_tmp, - buffer); - if (tmp_ptr->num_cpu_groups != uint32_tmp) - goto unpack_error; - safe_unpack32_array(&tmp_ptr->cpu_count_reps, &uint32_tmp, - buffer); - if (tmp_ptr->num_cpu_groups != uint32_tmp) + safe_unpack32(&tmp_ptr->num_cpu_groups, buffer); + if (tmp_ptr->num_cpu_groups > 0) { + safe_unpack16_array(&tmp_ptr->cpus_per_node, + &uint32_tmp, buffer); + if (tmp_ptr->num_cpu_groups != uint32_tmp) + goto unpack_error; + safe_unpack32_array(&tmp_ptr->cpu_count_reps, + &uint32_tmp, buffer); + if (tmp_ptr->num_cpu_groups != uint32_tmp) + goto unpack_error; + } else { + tmp_ptr->cpus_per_node = NULL; + tmp_ptr->cpu_count_reps = NULL; + } + + safe_unpack32(&tmp_ptr->node_cnt, buffer); + if (select_g_select_jobinfo_unpack(&tmp_ptr->select_jobinfo, + buffer, protocol_version)) goto unpack_error; } else { - tmp_ptr->cpus_per_node = NULL; - tmp_ptr->cpu_count_reps = NULL; - } + safe_unpack32(&tmp_ptr->error_code, buffer); + safe_unpack32(&tmp_ptr->job_id, buffer); + safe_unpack32(&tmp_ptr->pn_min_memory, buffer); + safe_unpackstr_xmalloc(&tmp_ptr->node_list, &uint32_tmp, + buffer); - safe_unpack32(&tmp_ptr->node_cnt, buffer); + safe_unpack32(&tmp_ptr->num_cpu_groups, buffer); + if (tmp_ptr->num_cpu_groups > 0) { + safe_unpack16_array(&tmp_ptr->cpus_per_node, + &uint32_tmp, buffer); + if (tmp_ptr->num_cpu_groups != uint32_tmp) + goto unpack_error; + safe_unpack32_array(&tmp_ptr->cpu_count_reps, + &uint32_tmp, buffer); + if (tmp_ptr->num_cpu_groups != uint32_tmp) + goto unpack_error; + } else { + tmp_ptr->cpus_per_node = NULL; + tmp_ptr->cpu_count_reps = NULL; + } - if (select_g_select_jobinfo_unpack(&tmp_ptr->select_jobinfo, buffer, - protocol_version)) - goto unpack_error; + safe_unpack32(&tmp_ptr->node_cnt, buffer); + if (select_g_select_jobinfo_unpack(&tmp_ptr->select_jobinfo, + buffer, protocol_version)) + goto unpack_error; + } return SLURM_SUCCESS; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 1e7d408b62c2548845c49e1231d2eb491c84c1dc..4f080efa12c75bb5dfb324fb206e27632daa1a13 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -5378,6 +5378,7 @@ static void _list_delete_job(void *job_entry) delete_job_details(job_ptr); xfree(job_ptr->account); + xfree(job_ptr->alias_list); xfree(job_ptr->alloc_node); xfree(job_ptr->batch_host); xfree(job_ptr->comment); diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 1d278e4d98376e76c93139fc32ade6e77e3bd00e..5ec80a4b01617b53cf4d481a682105250c7b7682 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -129,6 +129,7 @@ static bitstr_t *_valid_features(struct job_details 
*detail_ptr, extern void allocate_nodes(struct job_record *job_ptr) { int i; + struct node_record *node_ptr; #ifdef HAVE_FRONT_END job_ptr->front_end_ptr = assign_front_end(); @@ -137,13 +138,22 @@ extern void allocate_nodes(struct job_record *job_ptr) job_ptr->batch_host = xstrdup(job_ptr->front_end_ptr->name); #endif - for (i = 0; i < node_record_count; i++) { + xfree(job_ptr->alias_list); + for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count; + i++, node_ptr++) { if (!bit_test(job_ptr->node_bitmap, i)) continue; + if (IS_NODE_CLOUD(node_ptr)) { + if (job_ptr->alias_list) + xstrcat(job_ptr->alias_list, ","); + xstrcat(job_ptr->alias_list, node_ptr->name); + xstrcat(job_ptr->alias_list, ":"); + xstrcat(job_ptr->alias_list, node_ptr->comm_name); + } make_node_alloc(&node_record_table_ptr[i], job_ptr); if (job_ptr->batch_host) continue; - job_ptr->batch_host = xstrdup(node_record_table_ptr[i].name); + job_ptr->batch_host = xstrdup(node_ptr->name); } last_node_update = time(NULL); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index ba6958cfbd3008955f33d7f584f65a4b0531559c..463ca91fe8f23271051154fe3c91f6a618a0e20e 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -858,6 +858,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) alloc_msg.job_id = job_ptr->job_id; alloc_msg.node_cnt = job_ptr->node_cnt; alloc_msg.node_list = xstrdup(job_ptr->nodes); + alloc_msg.alias_list = xstrdup(job_ptr->alias_list); alloc_msg.select_jobinfo = select_g_select_jobinfo_copy(job_ptr->select_jobinfo); if (job_ptr->details) { @@ -2074,6 +2075,7 @@ static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg) job_info_resp_msg.job_id = job_info_msg->job_id; job_info_resp_msg.node_cnt = job_ptr->node_cnt; job_info_resp_msg.node_list = xstrdup(job_ptr->nodes); + job_info_resp_msg.alias_list = xstrdup(job_ptr->alias_list); job_info_resp_msg.select_jobinfo = select_g_select_jobinfo_copy(job_ptr->select_jobinfo); unlock_slurmctld(job_read_lock); @@ -2125,7 +2127,8 @@ static void _slurm_rpc_job_sbcast_cred(slurm_msg_t * msg) slurm_strerror(error_code)); slurm_send_rc_msg(msg, error_code); } else if ((sbcast_cred = create_sbcast_cred(slurmctld_config.cred_ctx, - job_ptr->job_id, job_ptr->nodes)) == NULL) { + job_ptr->job_id, + job_ptr->nodes)) == NULL){ unlock_slurmctld(job_read_lock); error("_slurm_rpc_job_sbcast_cred JobId=%u cred create error", job_info_msg->job_id); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 98d7651cd1d629587e8b06e7fd9495738dc2b58a..e1d707db75b6426cbe819db7490a5e4fc03bfba6 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -422,6 +422,7 @@ struct job_details { struct job_record { char *account; /* account number to charge */ + char *alias_list; /* node name to address aliases */ char *alloc_node; /* local node making resource alloc */ uint16_t alloc_resp_port; /* RESPONSE_RESOURCE_ALLOCATION port */ uint32_t alloc_sid; /* local sid making resource alloc */ diff --git a/src/slurmctld/srun_comm.c b/src/slurmctld/srun_comm.c index 3108bca7f52d4172fed6539efffe5d05399c311f..2032d116ed9e2048f12a1fc7c7fd3a3278a2e20f 100644 --- a/src/slurmctld/srun_comm.c +++ b/src/slurmctld/srun_comm.c @@ -93,6 +93,7 @@ extern void srun_allocate (uint32_t job_id) msg_arg = xmalloc(sizeof(resource_allocation_response_msg_t)); msg_arg->job_id = job_ptr->job_id; msg_arg->node_list = xstrdup(job_ptr->nodes); + msg_arg->alias_list = xstrdup(job_ptr->alias_list); msg_arg->num_cpu_groups = 
job_resrcs_ptr->cpu_array_cnt; msg_arg->cpus_per_node = xmalloc(sizeof(uint16_t) * job_resrcs_ptr->cpu_array_cnt); diff --git a/src/srun/srun.c b/src/srun/srun.c index ae31f92e372c4546edc2d4e19aaaa38bca88a319..a39dba1d5fd2a20f8d8e6176358e998105ad0db4 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -149,8 +149,9 @@ static void _pty_restore(void); static void _run_srun_prolog (srun_job_t *job); static void _run_srun_epilog (srun_job_t *job); static int _run_srun_script (srun_job_t *job, char *script); -static void _set_cpu_env_var(resource_allocation_response_msg_t *resp); +static void _set_env_vars(resource_allocation_response_msg_t *resp); static void _set_exit_code(void); +static void _set_node_alias(void); static void _step_opt_exclusive(void); static void _set_stdio_fds(srun_job_t *job, slurm_step_io_fds_t *cio_fds); static void _set_submit_dir_env(void); @@ -309,7 +310,7 @@ int srun(int ac, char **av) opt.alloc_nodelist = xstrdup(resp->node_list); if (opt.exclusive) _step_opt_exclusive(); - _set_cpu_env_var(resp); + _set_env_vars(resp); if (_validate_relative(resp)) exit(error_exit); job = job_step_create_allocation(resp); @@ -345,7 +346,7 @@ int srun(int ac, char **av) exit(error_exit); got_alloc = 1; _print_job_information(resp); - _set_cpu_env_var(resp); + _set_env_vars(resp); if (_validate_relative(resp)) { slurm_complete_job(resp->job_id, 1); exit(error_exit); @@ -438,6 +439,7 @@ int srun(int ac, char **av) setup_env(env, opt.preserve_env); xfree(env->task_count); xfree(env); + _set_node_alias(); re_launch: #if defined HAVE_BGQ @@ -779,19 +781,29 @@ static void _set_exit_code(void) } } -static void _set_cpu_env_var(resource_allocation_response_msg_t *resp) +static void _set_env_vars(resource_allocation_response_msg_t *resp) { char *tmp; - if (getenv("SLURM_JOB_CPUS_PER_NODE")) - return; + if (!getenv("SLURM_JOB_CPUS_PER_NODE")) { + tmp = uint32_compressed_to_str(resp->num_cpu_groups, + resp->cpus_per_node, + resp->cpu_count_reps); + if (setenvf(NULL, "SLURM_JOB_CPUS_PER_NODE", "%s", tmp) < 0) { + error("unable to set SLURM_JOB_CPUS_PER_NODE in " + "environment"); + } + xfree(tmp); + } + + if (!getenv("SLURM_NODE_ALIASES") && resp->alias_list) { + if (setenvf(NULL, "SLURM_NODE_ALIASES", "%s", + resp->alias_list) < 0) { + error("unable to set SLURM_NODE_ALIASES in " + "environment"); + } + } - tmp = uint32_compressed_to_str(resp->num_cpu_groups, - resp->cpus_per_node, - resp->cpu_count_reps); - if (setenvf(NULL, "SLURM_JOB_CPUS_PER_NODE", "%s", tmp) < 0) - error("unable to set SLURM_JOB_CPUS_PER_NODE in environment"); - xfree(tmp); return; } @@ -875,6 +887,27 @@ static int _set_rlimit_env(void) return rc; } +static void _set_node_alias(void) +{ + char *aliases, *save_ptr = NULL, *tmp; + char *addr, *name; + + tmp = getenv("SLURM_NODE_ALIASES"); + if (!tmp) + return; + aliases = xstrdup(tmp); + name = strtok_r(aliases, ":", &save_ptr); + while (name) { + addr = strtok_r(NULL, ",", &save_ptr); + if (addr) { + slurm_reset_alias(name, addr, addr); + name = strtok_r(NULL, ":", &save_ptr); + } else + name = NULL; + } + xfree(aliases); +} + static int _become_user (void) { char *user = uid_to_string(opt.uid);
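
For review purposes, a minimal standalone sketch of the SLURM_NODE_ALIASES handling introduced by this patch: allocate_nodes() in slurmctld joins "nodename:address" pairs with commas for cloud nodes, and srun's _set_node_alias() splits the value with strtok_r() before calling slurm_reset_alias(). This is an illustration only, not code from the patch; the node names and addresses are invented, strcat() replaces the xstrcat() string building, and a printf() stub stands in for slurm_reset_alias() so the example compiles outside the SLURM tree.

/*
 * Illustrative sketch only -- not part of the patch.  It mirrors how the
 * patch builds and consumes the SLURM_NODE_ALIASES value, with invented
 * node names/addresses and a printf() stub in place of slurm_reset_alias().
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for the slurm_reset_alias() call made in _set_node_alias(). */
static void reset_alias_stub(const char *name, const char *addr)
{
	printf("launch traffic for %s goes to %s\n", name, addr);
}

int main(void)
{
	/* Hypothetical cloud nodes and the addresses the cloud assigned. */
	const char *names[] = { "cloud0", "cloud1" };
	const char *addrs[] = { "10.0.0.10", "10.0.0.11" };
	char aliases[256] = "";
	char *copy, *save_ptr = NULL, *name, *addr;
	size_t i;

	/* Build the value the way allocate_nodes() does: name:addr,name:addr */
	for (i = 0; i < 2; i++) {
		if (aliases[0])
			strcat(aliases, ",");
		strcat(aliases, names[i]);
		strcat(aliases, ":");
		strcat(aliases, addrs[i]);
	}
	printf("SLURM_NODE_ALIASES=%s\n", aliases);

	/* Parse it the way _set_node_alias() does: alternate ':' and ','. */
	copy = strdup(aliases);		/* the real code uses xstrdup() */
	if (!copy)
		return 1;
	name = strtok_r(copy, ":", &save_ptr);
	while (name) {
		addr = strtok_r(NULL, ",", &save_ptr);
		if (!addr)
			break;
		reset_alias_stub(name, addr);
		name = strtok_r(NULL, ":", &save_ptr);
	}
	free(copy);
	return 0;
}

Running the sketch prints the composed SLURM_NODE_ALIASES string followed by one line per node/address pair, which is the information srun needs to direct job launch messages to cloud nodes whose addresses are not in slurm.conf.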