From a8e500b68a190ac7f2c4bb195f5f2d40a64d914f Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 22 Apr 2008 19:08:49 +0000 Subject: [PATCH] Fix bug in setting host address for PMI communications (mpich2 only) --- NEWS | 1 + slurm/slurm.h.in | 3 +++ src/api/step_launch.c | 18 ++++-------------- src/common/env.c | 7 +------ src/common/env.h | 3 +-- src/srun/srun.c | 6 +++--- 6 files changed, 13 insertions(+), 25 deletions(-) diff --git a/NEWS b/NEWS index ae7b98d43ba..fcc00de197c 100644 --- a/NEWS +++ b/NEWS @@ -17,6 +17,7 @@ documents those changes that are of interest to users and admins. -- Fix bug if sched/wiki or sched/wiki2 are configured and no job comment is set. -- scontrol modified to report partition partition's "DisableRootJobs" value. + -- Fix bug in setting host address for PMI communications (mpich2 only). -- NOTE: We needed to change an RPC from version 1.3.0. You must upgrade all nodes in a cluster from v1.3.0 to v1.3.1 at the same time. diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index e74e1c8e16c..11286cc89dc 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1431,9 +1431,12 @@ extern void slurm_step_launch_params_t_init /* * slurm_step_launch - launch a parallel job step * IN ctx - job step context generated by slurm_step_ctx_create + * IN launcher_host - address used for PMI communications + * IN callbacks - Identify functions to be called when various events occur * RET SLURM_SUCCESS or SLURM_ERROR (with errno set) */ extern int slurm_step_launch PARAMS((slurm_step_ctx_t *ctx, + char *launcher_host, const slurm_step_launch_params_t *params, const slurm_step_launch_callbacks_t *callbacks)); diff --git a/src/api/step_launch.c b/src/api/step_launch.c index 6968fde3d97..244c1f35324 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -124,10 +124,11 @@ void slurm_step_launch_params_t_init (slurm_step_launch_params_t *ptr) /* * slurm_step_launch - launch a parallel job step * IN ctx - job step context generated by slurm_step_ctx_create + * IN launcher_host - address used for PMI communications * IN callbacks - Identify functions to be called when various events occur * RET SLURM_SUCCESS or SLURM_ERROR (with errno set) */ -int slurm_step_launch (slurm_step_ctx_t *ctx, +int slurm_step_launch (slurm_step_ctx_t *ctx, char *launcher_host, const slurm_step_launch_params_t *params, const slurm_step_launch_callbacks_t *callbacks) { @@ -142,7 +143,6 @@ int slurm_step_launch (slurm_step_ctx_t *ctx, if (ctx == NULL || ctx->magic != STEP_CTX_MAGIC) { error("Not a valid slurm_step_ctx_t!"); - slurm_seterrno(EINVAL); return SLURM_ERROR; } @@ -194,18 +194,8 @@ int slurm_step_launch (slurm_step_ctx_t *ctx, } else { env_array_merge(&env, (const char **)params->env); } - { - /* FIXME - hostname and IP need to be user settable */ - char *launcher_hostname = xshort_hostname(); - struct hostent *ent = gethostbyname(launcher_hostname); - - env_array_for_step(&env, - ctx->step_resp, - launcher_hostname, - ctx->launch_state->resp_port[0], - ent->h_addr_list[0]); - xfree(launcher_hostname); - } + env_array_for_step(&env, ctx->step_resp, launcher_host, + ctx->launch_state->resp_port[0]); env_array_merge(&env, (const char **)mpi_env); env_array_free(mpi_env); diff --git a/src/common/env.c b/src/common/env.c index 8a866797c8c..a76e161ffbf 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -898,8 +898,7 @@ void env_array_for_step(char ***dest, const job_step_create_response_msg_t *step, const char *launcher_hostname, - uint16_t launcher_port, - const char *ip_addr_str) + uint16_t launcher_port) { char *tmp; @@ -917,8 +916,6 @@ env_array_for_step(char ***dest, "%s", launcher_hostname); env_array_overwrite_fmt(dest, "SLURM_STEP_LAUNCHER_PORT", "%hu", launcher_port); -/* env_array_overwrite_fmt(dest, "SLURM_STEP_LAUNCHER_IPADDR", */ -/* "%s", ip_addr_str); */ /* OBSOLETE */ env_array_overwrite_fmt(dest, "SLURM_STEPID", "%u", step->job_step_id); @@ -931,8 +928,6 @@ env_array_for_step(char ***dest, "%s", launcher_hostname); env_array_overwrite_fmt(dest, "SLURM_SRUN_COMM_PORT", "%hu", launcher_port); -/* env_array_overwrite_fmt(dest, "SLURM_LAUNCH_NODE_IPADDR", */ -/* "%s", ip_addr_str); */ xfree(tmp); } diff --git a/src/common/env.h b/src/common/env.h index 0790f671c61..f59adaf17dd 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -165,8 +165,7 @@ void env_array_for_step(char ***dest, const job_step_create_response_msg_t *step, const char *launcher_hostname, - uint16_t launcher_port, - const char *ip_addr_str); + uint16_t launcher_port); /* * Return an empty environment variable array (contains a single diff --git a/src/srun/srun.c b/src/srun/srun.c index c860f406d7c..3a67fc6f700 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -290,7 +290,7 @@ int srun(int ac, char **av) env->labelio = opt.labelio; env->comm_port = slurmctld_comm_addr.port; env->comm_hostname = slurmctld_comm_addr.hostname; - if(job) { + if (job) { uint16_t *tasks = NULL; slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS, &tasks); @@ -383,8 +383,8 @@ int srun(int ac, char **av) } update_job_state(job, SRUN_JOB_LAUNCHING); - if (slurm_step_launch(job->step_ctx, &launch_params, &callbacks) - != SLURM_SUCCESS) { + if (slurm_step_launch(job->step_ctx, slurmctld_comm_addr.hostname, + &launch_params, &callbacks) != SLURM_SUCCESS) { error("Application launch failed: %m"); goto cleanup; } -- GitLab