Skip to content
Snippets Groups Projects
Commit a8e500b6 authored by Moe Jette's avatar Moe Jette
Browse files

Fix bug in setting host address for PMI communications (mpich2 only)

parent 933b3a64
No related branches found
No related tags found
No related merge requests found
@@ -17,6 +17,7 @@ documents those changes that are of interest to users and admins.
 -- Fix bug if sched/wiki or sched/wiki2 are configured and no job comment is
    set.
 -- scontrol modified to report partition partition's "DisableRootJobs" value.
+-- Fix bug in setting host address for PMI communications (mpich2 only).
 -- NOTE: We needed to change an RPC from version 1.3.0. You must upgrade
    all nodes in a cluster from v1.3.0 to v1.3.1 at the same time.
......
@@ -1431,9 +1431,12 @@ extern void slurm_step_launch_params_t_init
 /*
  * slurm_step_launch - launch a parallel job step
  * IN ctx - job step context generated by slurm_step_ctx_create
+ * IN launcher_host - address used for PMI communications
+ * IN callbacks - Identify functions to be called when various events occur
  * RET SLURM_SUCCESS or SLURM_ERROR (with errno set)
  */
 extern int slurm_step_launch PARAMS((slurm_step_ctx_t *ctx,
+	char *launcher_host,
 	const slurm_step_launch_params_t *params,
 	const slurm_step_launch_callbacks_t *callbacks));
......
@@ -124,10 +124,11 @@ void slurm_step_launch_params_t_init (slurm_step_launch_params_t *ptr)
 /*
  * slurm_step_launch - launch a parallel job step
  * IN ctx - job step context generated by slurm_step_ctx_create
+ * IN launcher_host - address used for PMI communications
  * IN callbacks - Identify functions to be called when various events occur
  * RET SLURM_SUCCESS or SLURM_ERROR (with errno set)
  */
-int slurm_step_launch (slurm_step_ctx_t *ctx,
+int slurm_step_launch (slurm_step_ctx_t *ctx, char *launcher_host,
 		       const slurm_step_launch_params_t *params,
 		       const slurm_step_launch_callbacks_t *callbacks)
 {
@@ -142,7 +143,6 @@ int slurm_step_launch (slurm_step_ctx_t *ctx,
 	if (ctx == NULL || ctx->magic != STEP_CTX_MAGIC) {
 		error("Not a valid slurm_step_ctx_t!");
 		slurm_seterrno(EINVAL);
 		return SLURM_ERROR;
 	}
@@ -194,18 +194,8 @@ int slurm_step_launch (slurm_step_ctx_t *ctx,
 	} else {
 		env_array_merge(&env, (const char **)params->env);
 	}
-	{
-		/* FIXME - hostname and IP need to be user settable */
-		char *launcher_hostname = xshort_hostname();
-		struct hostent *ent = gethostbyname(launcher_hostname);
-		env_array_for_step(&env,
-				   ctx->step_resp,
-				   launcher_hostname,
-				   ctx->launch_state->resp_port[0],
-				   ent->h_addr_list[0]);
-		xfree(launcher_hostname);
-	}
+	env_array_for_step(&env, ctx->step_resp, launcher_host,
+			   ctx->launch_state->resp_port[0]);
 	env_array_merge(&env, (const char **)mpi_env);
 	env_array_free(mpi_env);
......
@@ -898,8 +898,7 @@ void
 env_array_for_step(char ***dest,
 		   const job_step_create_response_msg_t *step,
 		   const char *launcher_hostname,
-		   uint16_t launcher_port,
-		   const char *ip_addr_str)
+		   uint16_t launcher_port)
 {
 	char *tmp;
@@ -917,8 +916,6 @@ env_array_for_step(char ***dest,
 				"%s", launcher_hostname);
 	env_array_overwrite_fmt(dest, "SLURM_STEP_LAUNCHER_PORT",
 				"%hu", launcher_port);
-/* 	env_array_overwrite_fmt(dest, "SLURM_STEP_LAUNCHER_IPADDR", */
-/* 				"%s", ip_addr_str); */

 	/* OBSOLETE */
 	env_array_overwrite_fmt(dest, "SLURM_STEPID", "%u", step->job_step_id);
@@ -931,8 +928,6 @@ env_array_for_step(char ***dest,
 				"%s", launcher_hostname);
 	env_array_overwrite_fmt(dest, "SLURM_SRUN_COMM_PORT",
 				"%hu", launcher_port);
-/* 	env_array_overwrite_fmt(dest, "SLURM_LAUNCH_NODE_IPADDR", */
-/* 				"%s", ip_addr_str); */

 	xfree(tmp);
 }
......
@@ -165,8 +165,7 @@ void
 env_array_for_step(char ***dest,
 		   const job_step_create_response_msg_t *step,
 		   const char *launcher_hostname,
-		   uint16_t launcher_port,
-		   const char *ip_addr_str);
+		   uint16_t launcher_port);

 /*
  * Return an empty environment variable array (contains a single
......
@@ -290,7 +290,7 @@ int srun(int ac, char **av)
 	env->labelio = opt.labelio;
 	env->comm_port = slurmctld_comm_addr.port;
 	env->comm_hostname = slurmctld_comm_addr.hostname;
-	if(job) {
+	if (job) {
 		uint16_t *tasks = NULL;
 		slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS,
 				   &tasks);
@@ -383,8 +383,8 @@ int srun(int ac, char **av)
 	}
 	update_job_state(job, SRUN_JOB_LAUNCHING);
-	if (slurm_step_launch(job->step_ctx, &launch_params, &callbacks)
-	    != SLURM_SUCCESS) {
+	if (slurm_step_launch(job->step_ctx, slurmctld_comm_addr.hostname,
+			      &launch_params, &callbacks) != SLURM_SUCCESS) {
 		error("Application launch failed: %m");
 		goto cleanup;
 	}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment