From 91337c3fea7a1f7ceef0cd80c0eecbdde8402226 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Tue, 29 Aug 2017 14:08:26 -0600
Subject: [PATCH] Set SLURM_JOBID to be pack job leader

This applies to job steps for MPI. Even if there is more than one
  pack job component in a single MPI_COMM_WORLD, they will share
  a common SLURM_JOBID.
---
 slurm/slurm.h.in                        |  1 +
 src/api/step_launch.c                   |  9 +++++++--
 src/common/slurm_protocol_defs.h        |  1 +
 src/common/slurm_protocol_pack.c        |  4 ++++
 src/plugins/launch/slurm/launch_slurm.c |  1 +
 src/slurmd/slurmstepd/slurmstepd_job.c  |  4 +++-
 src/slurmd/slurmstepd/slurmstepd_job.h  |  1 +
 src/slurmd/slurmstepd/task.c            |  5 ++++-
 src/srun/libsrun/srun_job.c             | 26 ++++++++++++++++++++-----
 src/srun/libsrun/srun_job.h             |  1 +
 src/srun/srun.c                         |  5 ++++-
 11 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 66d0a9c2938..b88389a9611 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1928,6 +1928,7 @@ typedef struct {
 	bool no_alloc;
 	uint32_t slurmd_debug;  /* remote slurmd debug level */
 	uint32_t node_offset;	/* Pack job node offset or NO_VAL */
+	uint32_t pack_jobid;	/* pack job ID or NO_VAL */
 	uint32_t pack_ntasks;	/* total task count for entire pack job */
 	uint32_t pack_offset;	/* Pack job offset or NO_VAL */
 	uint32_t task_offset;	/* Pack job tsk offset or NO_VAL */
diff --git a/src/api/step_launch.c b/src/api/step_launch.c
index 97598ff160a..fa6cd9ca5a7 100644
--- a/src/api/step_launch.c
+++ b/src/api/step_launch.c
@@ -165,6 +165,7 @@ extern void slurm_step_launch_params_t_init(slurm_step_launch_params_t *ptr)
 	ptr->cpu_freq_max = NO_VAL;
 	ptr->cpu_freq_gov = NO_VAL;
 	ptr->node_offset  = NO_VAL;
+	ptr->pack_jobid   = NO_VAL;
 	ptr->pack_ntasks  = NO_VAL;
 	ptr->pack_offset  = NO_VAL;
 	ptr->task_offset  = NO_VAL;
@@ -294,6 +295,7 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx,
 	launch.cred = ctx->step_resp->cred;
 	launch.job_step_id = ctx->step_resp->job_step_id;
 	launch.node_offset = params->node_offset;
+	launch.pack_jobid  = params->pack_jobid;
 	launch.pack_ntasks = params->pack_ntasks;
 	launch.pack_offset = params->pack_offset;
 	launch.task_offset = params->task_offset;
@@ -500,12 +502,15 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx,
 	launch.spank_job_env_size = params->spank_job_env_size;
 	launch.cred = ctx->step_resp->cred;
 	launch.job_step_id = ctx->step_resp->job_step_id;
+	launch.pack_jobid  = params->pack_jobid;
 	launch.pack_ntasks = params->pack_ntasks;
 	launch.pack_offset = params->pack_offset;
 	launch.task_offset = params->task_offset;
 	if (params->env == NULL) {
-		/* if the user didn't specify an environment, grab the
-		 * environment of the running process */
+		/*
+		 * if the user didn't specify an environment, grab the
+		 * environment of the running process
+		 */
 		env_array_merge(&env, (const char **)environ);
 	} else {
 		env_array_merge(&env, (const char **)params->env);
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 98b72fca17a..213de4f3102 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -767,6 +767,7 @@ typedef struct launch_tasks_request_msg {
 	uint32_t  job_id;
 	uint32_t  job_step_id;
 	uint32_t  node_offset;	/* pack job node offset of NO_VAL */
+	uint32_t  pack_jobid;	/* pack job ID or NO_VAL */
 	uint32_t  pack_ntasks;	/* total task count for entire pack job */
 	uint32_t  pack_offset;	/* pack job offset of NO_VAL */
 	uint32_t  task_offset;	/* pack job task ID offset of NO_VAL */
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index a1d2dc2a0cf..7e87b22707e 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -10262,6 +10262,7 @@ _pack_launch_tasks_request_msg(launch_tasks_request_msg_t * msg, Buf buffer,
 		pack32(msg->job_id, buffer);
 		pack32(msg->job_step_id, buffer);
 		pack32(msg->node_offset, buffer);
+		pack32(msg->pack_jobid, buffer);
 		pack32(msg->pack_ntasks, buffer);
 		pack32(msg->pack_offset, buffer);
 		pack32(msg->task_offset, buffer);
@@ -10549,6 +10550,7 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t **
 		safe_unpack32(&msg->job_id, buffer);
 		safe_unpack32(&msg->job_step_id, buffer);
 		safe_unpack32(&msg->node_offset, buffer);
+		safe_unpack32(&msg->pack_jobid, buffer);
 		safe_unpack32(&msg->pack_ntasks, buffer);
 		safe_unpack32(&msg->pack_offset, buffer);
 		safe_unpack32(&msg->task_offset, buffer);
@@ -10661,6 +10663,7 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t **
 		safe_unpack32(&msg->job_id, buffer);
 		safe_unpack32(&msg->job_step_id, buffer);
 		msg->node_offset = NO_VAL;
+		msg->pack_jobid  = NO_VAL;
 		msg->pack_ntasks = NO_VAL;
 		msg->pack_offset = NO_VAL;
 		msg->task_offset = NO_VAL;
@@ -10792,6 +10795,7 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t **
 		safe_unpack32(&msg->job_id, buffer);
 		safe_unpack32(&msg->job_step_id, buffer);
 		msg->node_offset = NO_VAL;
+		msg->pack_jobid  = NO_VAL;
 		msg->pack_ntasks = NO_VAL;
 		msg->pack_offset = NO_VAL;
 		msg->task_offset = NO_VAL;
diff --git a/src/plugins/launch/slurm/launch_slurm.c b/src/plugins/launch/slurm/launch_slurm.c
index 0bd7c617d9a..82cc84fce8b 100644
--- a/src/plugins/launch/slurm/launch_slurm.c
+++ b/src/plugins/launch/slurm/launch_slurm.c
@@ -674,6 +674,7 @@ extern int launch_p_step_launch(srun_job_t *job, slurm_step_io_fds_t *cio_fds,
 	launch_params.remote_input_filename = fname_remote_string(job->ifname);
 	launch_params.remote_error_filename = fname_remote_string(job->efname);
 	launch_params.node_offset = job->node_offset;
+	launch_params.pack_jobid  = job->pack_jobid;
 	launch_params.pack_ntasks = job->pack_ntasks;
 	launch_params.pack_offset = job->pack_offset;
 	launch_params.task_offset = job->task_offset;
diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c
index ec2164b73d6..f71f654feaf 100644
--- a/src/slurmd/slurmstepd/slurmstepd_job.c
+++ b/src/slurmd/slurmstepd/slurmstepd_job.c
@@ -309,6 +309,7 @@ extern stepd_step_rec_t *stepd_step_rec_create(launch_tasks_request_msg_t *msg,
 	job->array_job_id  = msg->job_id;
 	job->array_task_id = NO_VAL;
 	job->node_offset = msg->node_offset;	/* Used for env vars */
+	job->pack_jobid  = msg->pack_jobid;	/* Used for env vars */
 	job->pack_ntasks = msg->pack_ntasks;	/* Used for env vars */
 	job->pack_offset = msg->pack_offset;	/* Used for env vars & labels */
 	job->task_offset = msg->task_offset;	/* Used for env vars & labels */
@@ -472,7 +473,8 @@ batch_stepd_step_rec_create(batch_job_launch_msg_t *msg)
 	job->stepid  = msg->step_id;
 	job->array_job_id  = msg->array_job_id;
 	job->array_task_id = msg->array_task_id;
-	job->pack_offset = NO_VAL;	/* Used to set output labels */
+	job->pack_jobid  = NO_VAL;	/* Used to set env vars */
+	job->pack_offset = NO_VAL;	/* Used to set labels and env vars */
 	job->job_core_spec = msg->job_core_spec;
 
 	job->batch   = true;
diff --git a/src/slurmd/slurmstepd/slurmstepd_job.h b/src/slurmd/slurmstepd/slurmstepd_job.h
index 702a65b2910..68ec5e83301 100644
--- a/src/slurmd/slurmstepd/slurmstepd_job.h
+++ b/src/slurmd/slurmstepd/slurmstepd_job.h
@@ -135,6 +135,7 @@ typedef struct {
 	uint32_t       nodeid; /* relative position of this node in job     */
 	uint32_t       node_offset; 	/* pack job node offset or NO_VAL   */
 	uint32_t       node_tasks; /* number of tasks on *this* node        */
+	uint32_t       pack_jobid;	/* pack job ID or NO_VAL */
 	uint32_t       pack_ntasks;	/* total task count for entire pack job */
 	uint32_t       pack_offset; 	/* pack job offset or NO_VAL        */
 	uint32_t       task_offset; 	/* pack job task offset or NO_VAL   */
diff --git a/src/slurmd/slurmstepd/task.c b/src/slurmd/slurmstepd/task.c
index 3fa2836e789..8168cac272b 100644
--- a/src/slurmd/slurmstepd/task.c
+++ b/src/slurmd/slurmstepd/task.c
@@ -357,7 +357,10 @@ extern void exec_task(stepd_step_rec_t *job, int i)
 	job->envtp->sgtids = _uint32_array_to_str(job->node_tasks, gtids);
 	xfree(gtids);
 
-	job->envtp->jobid = job->jobid;
+	if (job->pack_jobid != NO_VAL)
+		job->envtp->jobid = job->pack_jobid;
+	else
+		job->envtp->jobid = job->jobid;
 	job->envtp->stepid = job->stepid;
 	job->envtp->nodeid = job->nodeid + node_offset;
 	job->envtp->cpus_on_node = job->cpus;
diff --git a/src/srun/libsrun/srun_job.c b/src/srun/libsrun/srun_job.c
index 0bb321b5bec..9f1374684cd 100644
--- a/src/srun/libsrun/srun_job.c
+++ b/src/srun/libsrun/srun_job.c
@@ -365,7 +365,8 @@ extern srun_job_t *job_step_create_allocation(
 		buf = hostlist_ranged_string_xmalloc(hl);
 		count = hostlist_count(hl);
 		hostlist_destroy(hl);
-		/* Don't reset the ai->nodelist because that is the
+		/*
+		 * Don't reset the ai->nodelist because that is the
 		 * nodelist we want to say the allocation is under
 		 * opt_local->nodelist is what is used for the allocation.
 		 */
@@ -675,7 +676,7 @@ static void _set_step_opts(opt_t *opt_local)
  * the job allocation request with its requested options.
  */
 static int _create_job_step(srun_job_t *job, bool use_all_cpus,
-			    List srun_job_list)
+			    List srun_job_list, uint32_t pack_jobid)
 {
 	ListIterator opt_iter = NULL, job_iter;
 	opt_t *opt_local = &opt;
@@ -688,6 +689,8 @@ static int _create_job_step(srun_job_t *job, bool use_all_cpus,
 			opt_iter = list_iterator_create(opt_list);
 		job_iter = list_iterator_create(srun_job_list);
 		while ((job = (srun_job_t *) list_next(job_iter))) {
+			if (pack_jobid)
+				job->pack_jobid = pack_jobid;
 			job->stepid = NO_VAL;
 			pack_ntasks += job->ntasks;
 		}
@@ -762,7 +765,7 @@ extern void create_srun_job(void **p_job, bool *got_alloc,
 	srun_job_t *job = NULL;
 	int i, max_list_offset, max_pack_offset, pack_offset = -1;
 	opt_t *opt_local;
-	uint32_t my_job_id = 0;
+	uint32_t my_job_id = 0, pack_jobid = 0;
 	bool begin_error_logged = false;
 	bool core_spec_error_logged = false;
 #ifdef HAVE_NATIVE_CRAY
@@ -924,8 +927,14 @@ extern void create_srun_job(void **p_job, bool *got_alloc,
 			      max_pack_offset, pack_offset);
 			exit(error_exit);
 		}
+		if (srun_job_list && (list_count(srun_job_list) > 1) &&
+		    opt_list && (list_count(opt_list) > 1) &&
+		    my_job_id && (opt.mpi_combine == true)) {
+			pack_jobid = my_job_id;
+		}
 
-		if (_create_job_step(job, false, srun_job_list) < 0) {
+		if (_create_job_step(job, false, srun_job_list, pack_jobid)
+		    < 0) {
 			if (*got_alloc)
 				slurm_complete_job(my_job_id, 1);
 			else
@@ -994,6 +1003,11 @@ extern void create_srun_job(void **p_job, bool *got_alloc,
 			job = job_create_allocation(resp, &opt);
 			_set_step_opts(&opt);
 		}
+		if (srun_job_list && (list_count(srun_job_list) > 1) &&
+		    opt_list && (list_count(opt_list) > 1) &&
+		    my_job_id && (opt.mpi_combine == true)) {
+			pack_jobid = my_job_id;
+		}
 
 		/*
 		 *  Become --uid user
@@ -1001,7 +1015,8 @@ extern void create_srun_job(void **p_job, bool *got_alloc,
 		if (_become_user () < 0)
 			info("Warning: Unable to assume uid=%u", opt.uid);
 
-		if (_create_job_step(job, true, srun_job_list) < 0) {
+		if (_create_job_step(job, true, srun_job_list, pack_jobid)
+		    < 0) {
 			slurm_complete_job(my_job_id, 1);
 			exit(error_exit);
 		}
@@ -1196,6 +1211,7 @@ static srun_job_t *_job_create_structure(allocation_info_t *ainfo,
  	job->nodelist = xstrdup(ainfo->nodelist);
  	job->partition = xstrdup(ainfo->partition);
 	job->stepid  = ainfo->stepid;
+ 	job->pack_jobid  = NO_VAL;
  	job->pack_ntasks = NO_VAL;
  	job->pack_offset = NO_VAL;
  	job->task_offset = NO_VAL;
diff --git a/src/srun/libsrun/srun_job.h b/src/srun/libsrun/srun_job.h
index 504f6945145..39b3430ac72 100644
--- a/src/srun/libsrun/srun_job.h
+++ b/src/srun/libsrun/srun_job.h
@@ -82,6 +82,7 @@ typedef struct srun_job {
 	uint32_t jobid;		/* assigned job id 	                  */
 	uint32_t stepid;	/* assigned step id 	                  */
 	uint32_t node_offset;	/* pack job node offset or NO_VAL */
+	uint32_t pack_jobid;	/* pack job leader or NO_VAL */
 	uint32_t pack_ntasks;	/* total task count for entire pack job */
 	uint32_t pack_offset;	/* pack job offset or NO_VAL */
 	uint32_t task_offset;	/* pack job tsk offset or NO_VAL */
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 4c90c5a0c4a..d638e2ce8fb 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -390,7 +390,10 @@ static void _setup_one_job_env(opt_t *opt_local, srun_job_t *job,
 		if (job->pack_ntasks != NO_VAL)
 			env->ntasks = job->pack_ntasks;
 		env->task_count = _uint16_array_to_str(job->nhosts, tasks);
-		env->jobid = job->jobid;
+		if (job->pack_jobid != NO_VAL)
+			env->jobid = job->pack_jobid;
+		else
+			env->jobid = job->jobid;
 		env->stepid = job->stepid;
 		env->account = job->account;
 		env->qos = job->qos;
-- 
GitLab