From 822b3da8562b51b62f41634aace827c7909c73c7 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Wed, 16 Aug 2017 15:52:58 -0600 Subject: [PATCH] Correct some recent pack job env var work --- src/api/step_launch.c | 2 ++ src/common/slurm_protocol_defs.h | 1 + src/common/slurm_protocol_pack.c | 4 ++++ src/slurmd/slurmstepd/mgr.c | 11 ++++++----- src/slurmd/slurmstepd/slurmstepd_job.c | 1 + src/slurmd/slurmstepd/task.c | 8 ++++---- src/srun/libsrun/srun_job.c | 4 ++-- 7 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/api/step_launch.c b/src/api/step_launch.c index e2bb17bd3e8..97116dde97d 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -292,6 +292,7 @@ extern int slurm_step_launch(slurm_step_ctx_t *ctx, launch.job_step_id = ctx->step_resp->job_step_id; launch.pack_ntasks = params->pack_ntasks; launch.pack_offset = params->pack_offset; + launch.task_offset = params->task_offset; if (params->env == NULL) { /* if the user didn't specify an environment, grab the * environment of the running process */ @@ -495,6 +496,7 @@ extern int slurm_step_launch_add(slurm_step_ctx_t *ctx, launch.job_step_id = ctx->step_resp->job_step_id; launch.pack_ntasks = params->pack_ntasks; launch.pack_offset = params->pack_offset; + launch.task_offset = params->task_offset; if (params->env == NULL) { /* if the user didn't specify an environment, grab the * environment of the running process */ diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 55a3fc53a5f..eb8e02c8513 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -768,6 +768,7 @@ typedef struct launch_tasks_request_msg { uint32_t job_step_id; uint32_t pack_ntasks; /* total task count for entire pack job */ uint32_t pack_offset; /* pack job offset of NO_VAL */ + uint32_t task_offset; /* pack job task ID offset or NO_VAL */ uint32_t nnodes; /* number of nodes in this job step */ uint32_t ntasks; /* number of tasks 
in this job step */ uint16_t ntasks_per_board;/* number of tasks to invoke on each board */ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index f7ba69b422e..668735d65f7 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -10252,6 +10252,7 @@ _pack_launch_tasks_request_msg(launch_tasks_request_msg_t * msg, Buf buffer, pack32(msg->job_step_id, buffer); pack32(msg->pack_ntasks, buffer); pack32(msg->pack_offset, buffer); + pack32(msg->task_offset, buffer); pack32(msg->ntasks, buffer); pack16(msg->ntasks_per_board, buffer); pack16(msg->ntasks_per_core, buffer); @@ -10537,6 +10538,7 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t ** safe_unpack32(&msg->job_step_id, buffer); safe_unpack32(&msg->pack_ntasks, buffer); safe_unpack32(&msg->pack_offset, buffer); + safe_unpack32(&msg->task_offset, buffer); safe_unpack32(&msg->ntasks, buffer); safe_unpack16(&msg->ntasks_per_board, buffer); safe_unpack16(&msg->ntasks_per_core, buffer); @@ -10647,6 +10649,7 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t ** safe_unpack32(&msg->job_step_id, buffer); msg->pack_ntasks = NO_VAL; msg->pack_offset = NO_VAL; + msg->task_offset = NO_VAL; safe_unpack32(&uint32_tmp, buffer); safe_unpack32(&uint32_tmp, buffer); safe_unpack32(&uint32_tmp, buffer); @@ -10776,6 +10779,7 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t ** safe_unpack32(&msg->job_step_id, buffer); msg->pack_ntasks = NO_VAL; msg->pack_offset = NO_VAL; + msg->task_offset = NO_VAL; safe_unpack32(&msg->ntasks, buffer); safe_unpack16(&msg->ntasks_per_board, buffer); safe_unpack16(&msg->ntasks_per_core, buffer); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index e65b42abc13..0f9c7f51d35 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -1976,10 +1976,10 @@ _wait_for_any_task(stepd_step_rec_t *job, bool waitflag) jobacctinfo_t *jobacct = NULL; struct rusage rusage; char 
**tmp_env; - uint32_t pack_offset = 0; + uint32_t task_offset = 0; - if (job->pack_offset != NO_VAL) - pack_offset = job->pack_offset; + if (job->task_offset != NO_VAL) + task_offset = job->task_offset; do { pid = wait3(&status, waitflag ? 0 : WNOHANG, &rusage); if (pid == -1) { @@ -2028,7 +2028,7 @@ _wait_for_any_task(stepd_step_rec_t *job, bool waitflag) _log_task_exit(t->gtid, pid, status); t->exited = true; t->estatus = status; - job->envtp->procid = t->gtid + pack_offset; + job->envtp->procid = t->gtid + task_offset; job->envtp->localid = t->id; job->envtp->distribution = -1; job->envtp->batch_flag = job->batch; @@ -2038,7 +2038,8 @@ _wait_for_any_task(stepd_step_rec_t *job, bool waitflag) * place or concurrent searches of the environment can * generate invalid memory references. */ - job->envtp->env = env_array_copy((const char **) job->env); + job->envtp->env = + env_array_copy((const char **) job->env); setup_env(job->envtp, false); tmp_env = job->env; job->env = job->envtp->env; diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index fe5dd3e8e3e..148f8210abb 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -310,6 +310,7 @@ extern stepd_step_rec_t *stepd_step_rec_create(launch_tasks_request_msg_t *msg, job->array_task_id = NO_VAL; job->pack_ntasks = msg->pack_ntasks; /* Used for env vars */ job->pack_offset = msg->pack_offset; /* Used for env vars & labels */ + job->task_offset = msg->task_offset; /* Used for env vars & labels */ for (i = 0; i < msg->envc; i++) { /* 1234567890123456789 */ if (!xstrncmp(msg->env[i], "SLURM_ARRAY_JOB_ID=", 19)) diff --git a/src/slurmd/slurmstepd/task.c b/src/slurmd/slurmstepd/task.c index 2074e291680..f3539ef2c0e 100644 --- a/src/slurmd/slurmstepd/task.c +++ b/src/slurmd/slurmstepd/task.c @@ -343,10 +343,10 @@ exec_task(stepd_step_rec_t *job, int i) stepd_step_task_info_t *task = job->task[i]; char **tmp_env; int saved_errno; - 
uint32_t pack_offset = 0; + uint32_t task_offset = 0; - if (job->pack_offset != NO_VAL) - pack_offset = job->pack_offset; + if (job->task_offset != NO_VAL) + task_offset = job->task_offset; if (i == 0) _make_tmpdir(job); @@ -360,7 +360,7 @@ exec_task(stepd_step_rec_t *job, int i) job->envtp->stepid = job->stepid; job->envtp->nodeid = job->nodeid; job->envtp->cpus_on_node = job->cpus; - job->envtp->procid = task->gtid + pack_offset; + job->envtp->procid = task->gtid + task_offset; job->envtp->localid = task->id; job->envtp->task_pid = getpid(); job->envtp->distribution = job->task_dist; diff --git a/src/srun/libsrun/srun_job.c b/src/srun/libsrun/srun_job.c index baf19929978..73424a5202e 100644 --- a/src/srun/libsrun/srun_job.c +++ b/src/srun/libsrun/srun_job.c @@ -703,10 +703,10 @@ static int _create_job_step(srun_job_t *job, bool use_all_cpus, fatal("%s: opt_list too short", __func__); job->pack_offset = pack_offset; if (opt.mpi_combine) { - pack_offset++; job->pack_ntasks = pack_ntasks; job->task_offset = task_offset; - } + } else + pack_offset++; rc = create_job_step(job, use_all_cpus, opt_local); if (rc < 0) break; -- GitLab