From a8e500b68a190ac7f2c4bb195f5f2d40a64d914f Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 22 Apr 2008 19:08:49 +0000
Subject: [PATCH] Fix bug in setting host address for PMI communications
 (mpich2 only)

---
 NEWS                  |  1 +
 slurm/slurm.h.in      |  3 +++
 src/api/step_launch.c | 18 ++++--------------
 src/common/env.c      |  7 +------
 src/common/env.h      |  3 +--
 src/srun/srun.c       |  6 +++---
 6 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/NEWS b/NEWS
index ae7b98d43ba..fcc00de197c 100644
--- a/NEWS
+++ b/NEWS
@@ -17,6 +17,7 @@ documents those changes that are of interest to users and admins.
  -- Fix bug if sched/wiki or sched/wiki2 are configured and no job comment is 
     set.
  -- scontrol modified to report partition partition's "DisableRootJobs" value.
+ -- Fix bug in setting host address for PMI communications (mpich2 only).
  -- NOTE: We needed to change an RPC from version 1.3.0. You must upgrade 
     all nodes in a cluster from v1.3.0 to v1.3.1 at the same time.
 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index e74e1c8e16c..11286cc89dc 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1431,9 +1431,12 @@ extern void slurm_step_launch_params_t_init
 /*
  * slurm_step_launch - launch a parallel job step
  * IN ctx - job step context generated by slurm_step_ctx_create
+ * IN launcher_host - launcher's hostname/address for PMI communications
+ * IN callbacks - Identify functions to be called when various events occur
  * RET SLURM_SUCCESS or SLURM_ERROR (with errno set)
  */
 extern int slurm_step_launch PARAMS((slurm_step_ctx_t *ctx,
+	char *launcher_host,
 	const slurm_step_launch_params_t *params,
 	const slurm_step_launch_callbacks_t *callbacks));
 
diff --git a/src/api/step_launch.c b/src/api/step_launch.c
index 6968fde3d97..244c1f35324 100644
--- a/src/api/step_launch.c
+++ b/src/api/step_launch.c
@@ -124,10 +124,11 @@ void slurm_step_launch_params_t_init (slurm_step_launch_params_t *ptr)
 /*
  * slurm_step_launch - launch a parallel job step
  * IN ctx - job step context generated by slurm_step_ctx_create
+ * IN launcher_host - launcher's hostname/address for PMI communications
  * IN callbacks - Identify functions to be called when various events occur
  * RET SLURM_SUCCESS or SLURM_ERROR (with errno set)
  */
-int slurm_step_launch (slurm_step_ctx_t *ctx,
+int slurm_step_launch (slurm_step_ctx_t *ctx, char *launcher_host,
 		       const slurm_step_launch_params_t *params,
 		       const slurm_step_launch_callbacks_t *callbacks)
 {
@@ -142,7 +143,6 @@ int slurm_step_launch (slurm_step_ctx_t *ctx,
 
 	if (ctx == NULL || ctx->magic != STEP_CTX_MAGIC) {
 		error("Not a valid slurm_step_ctx_t!");
-
 		slurm_seterrno(EINVAL);
 		return SLURM_ERROR;
 	}
@@ -194,18 +194,8 @@ int slurm_step_launch (slurm_step_ctx_t *ctx,
 	} else {
 		env_array_merge(&env, (const char **)params->env);
 	}
-	{
-		/* FIXME - hostname and IP need to be user settable */
-		char *launcher_hostname = xshort_hostname();
-		struct hostent *ent = gethostbyname(launcher_hostname);
-
-		env_array_for_step(&env,
-				   ctx->step_resp,
-				   launcher_hostname,
-				   ctx->launch_state->resp_port[0],
-				   ent->h_addr_list[0]);
-		xfree(launcher_hostname);
-	}
+	env_array_for_step(&env, ctx->step_resp, launcher_host,
+			   ctx->launch_state->resp_port[0]);
 	env_array_merge(&env, (const char **)mpi_env);
 	env_array_free(mpi_env);
 
diff --git a/src/common/env.c b/src/common/env.c
index 8a866797c8c..a76e161ffbf 100644
--- a/src/common/env.c
+++ b/src/common/env.c
@@ -898,8 +898,7 @@ void
 env_array_for_step(char ***dest, 
 		   const job_step_create_response_msg_t *step,
 		   const char *launcher_hostname,
-		   uint16_t launcher_port,
-		   const char *ip_addr_str)
+		   uint16_t launcher_port)
 {
 	char *tmp;
 
@@ -917,8 +916,6 @@ env_array_for_step(char ***dest,
 			 "%s", launcher_hostname);
 	env_array_overwrite_fmt(dest, "SLURM_STEP_LAUNCHER_PORT",
 			 "%hu", launcher_port);
-/* 	env_array_overwrite_fmt(dest, "SLURM_STEP_LAUNCHER_IPADDR", */
-/* 			 "%s", ip_addr_str); */
 
 	/* OBSOLETE */
 	env_array_overwrite_fmt(dest, "SLURM_STEPID", "%u", step->job_step_id);
@@ -931,8 +928,6 @@ env_array_for_step(char ***dest,
 			 "%s", launcher_hostname);
 	env_array_overwrite_fmt(dest, "SLURM_SRUN_COMM_PORT",
 			 "%hu", launcher_port);
-/* 	env_array_overwrite_fmt(dest, "SLURM_LAUNCH_NODE_IPADDR", */
-/* 			 "%s", ip_addr_str); */
 
 	xfree(tmp);
 }
diff --git a/src/common/env.h b/src/common/env.h
index 0790f671c61..f59adaf17dd 100644
--- a/src/common/env.h
+++ b/src/common/env.h
@@ -165,8 +165,7 @@ void
 env_array_for_step(char ***dest,
 		   const job_step_create_response_msg_t *step,
 		   const char *launcher_hostname,
-		   uint16_t launcher_port,
-		   const char *ip_addr_str);
+		   uint16_t launcher_port);
 
 /*
  * Return an empty environment variable array (contains a single
diff --git a/src/srun/srun.c b/src/srun/srun.c
index c860f406d7c..3a67fc6f700 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -290,7 +290,7 @@ int srun(int ac, char **av)
 	env->labelio = opt.labelio;
 	env->comm_port = slurmctld_comm_addr.port;
 	env->comm_hostname = slurmctld_comm_addr.hostname;
-	if(job) {
+	if (job) {
 		uint16_t *tasks = NULL;
 		slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS, 
 				   &tasks);
@@ -383,8 +383,8 @@ int srun(int ac, char **av)
 	}
 
 	update_job_state(job, SRUN_JOB_LAUNCHING);
-	if (slurm_step_launch(job->step_ctx, &launch_params, &callbacks)
-	    != SLURM_SUCCESS) {
+	if (slurm_step_launch(job->step_ctx, slurmctld_comm_addr.hostname, 
+	    &launch_params, &callbacks) != SLURM_SUCCESS) {
 		error("Application launch failed: %m");
 		goto cleanup;
 	}
-- 
GitLab