From 810d649b1c1f6e0290ef400d1a306a5579c6ba15 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Fri, 24 Jun 2005 18:08:30 +0000 Subject: [PATCH] Cleaned up env code once again. All env handling for jobs happens in slurmd/smgr.c. Setup for the env is done within slurmd/mgr.c, into a new variable in slurmd_job_t, job->envtp, which is an env_t *. All variables are set up there except when srun runs the script instead of slurmd. For the srun case things are the same as before. --- src/common/env.c | 10 ++-- src/common/env.h | 8 +-- src/slurmd/job.c | 18 +++++-- src/slurmd/job.h | 4 +- src/slurmd/mgr.c | 121 ++++++++++++++-------------------------------- src/slurmd/smgr.c | 43 +++++++--------- src/srun/srun.c | 5 +- 7 files changed, 81 insertions(+), 128 deletions(-) diff --git a/src/common/env.c b/src/common/env.c index dc76d97261f..01558efa9bf 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -225,13 +225,13 @@ int setup_env(env_t *env) return SLURM_ERROR; if (env->nprocs - && setenvf(&env->env, "SLURM_NPROCS", "%u", env->nprocs)) { + && setenvf(&env->env, "SLURM_NPROCS", "%d", env->nprocs)) { error("Unable to set SLURM_NPROCS environment variable"); rc = SLURM_FAILURE; } if (env->cpus_per_task - && setenvf(&env->env, "SLURM_CPUS_PER_TASK", "%u", + && setenvf(&env->env, "SLURM_CPUS_PER_TASK", "%d", env->cpus_per_task) ) { error("Unable to set SLURM_CPUS_PER_TASK"); rc = SLURM_FAILURE; @@ -286,7 +286,7 @@ int setup_env(env_t *env) } if (env->jobid >= 0 - && setenvf(&env->env, "SLURM_JOBID", "%u", env->jobid)) { + && setenvf(&env->env, "SLURM_JOBID", "%d", env->jobid)) { error("Unable to set SLURM_JOBID environment"); rc = SLURM_FAILURE; } @@ -304,7 +304,7 @@ int setup_env(env_t *env) } if (env->stepid >= 0 - && setenvf(&env->env, "SLURM_STEPID", "%u", env->stepid)) { + && setenvf(&env->env, "SLURM_STEPID", "%d", env->stepid)) { error("Unable to set SLURM_STEPID environment"); rc = SLURM_FAILURE; } @@ -316,7 +316,7 @@ int setup_env(env_t *env) } if (env->nhosts - && 
setenvf(&env->env, "SLURM_NNODES", "%u", env->nhosts)) { + && setenvf(&env->env, "SLURM_NNODES", "%d", env->nhosts)) { error("Unable to set SLURM_NNODES environment var"); rc = SLURM_FAILURE; } diff --git a/src/common/env.h b/src/common/env.h index ea9eb385a79..ac43d09afa9 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -39,7 +39,6 @@ typedef struct env_options { int nprocs; /* --nprocs=n, -n n */ char *task_count; bool nprocs_set; /* true if nprocs explicitly set */ - int cpus_per_task; /* --cpus-per-task=n, -c n */ bool cpus_set; /* true if cpus_per_task explicitly set */ enum distribution_t distribution; /* --distribution=, -m dist */ @@ -47,16 +46,17 @@ typedef struct env_options { int slurmd_debug; /* --slurmd-debug, -D */ bool labelio; /* --label-output, -l */ select_jobinfo_t select_jobinfo; - uint32_t jobid; /* assigned job id */ - uint32_t stepid; /* assigned step id */ int nhosts; char *nodelist; /* nodelist in string form */ char **env; /* job environment */ slurm_addr *cli; slurm_addr *self; + int jobid; /* assigned job id */ + int stepid; /* assigned step id */ int procid; - int gmpi; int nodeid; + int gmpi; + int cpus_per_task; /* --cpus-per-task=n, -c n */ int cpus_on_node; } env_t; diff --git a/src/slurmd/job.c b/src/slurmd/job.c index 7931910bb8e..b77d03f29ea 100644 --- a/src/slurmd/job.c +++ b/src/slurmd/job.c @@ -177,7 +177,13 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) job->env = _array_copy(msg->envc, msg->env); job->argc = msg->argc; job->argv = _array_copy(job->argc, msg->argv); - + job->envtp = xmalloc(sizeof(env_t)); + job->envtp->jobid = -1; + job->envtp->stepid = -1; + job->envtp->gmpi = -1; + job->envtp->procid = -1; + job->envtp->nodeid = -1; + job->cwd = xstrdup(msg->cwd); memcpy(&resp_addr, cli_addr, sizeof(slurm_addr)); @@ -333,7 +339,13 @@ job_batch_job_create(batch_job_launch_msg_t *msg) job->eio = eio_handle_create(); job->objs = list_create((ListDelF) io_obj_destroy); job->sruns = 
list_create((ListDelF) _srun_info_destructor); - + job->envtp = xmalloc(sizeof(env_t)); + job->envtp->jobid = -1; + job->envtp->stepid = -1; + job->envtp->gmpi = -1; + job->envtp->procid = -1; + job->envtp->nodeid = -1; + srun = srun_info_create(NULL, NULL, NULL); srun->ofname = _mkfilename(job, msg->out); @@ -437,7 +449,7 @@ job_destroy(slurmd_job_t *job) task_info_destroy(job->task[i]); list_destroy(job->sruns); list_destroy(job->objs); - + xfree(job->envtp); xfree(job); } diff --git a/src/slurmd/job.h b/src/slurmd/job.h index 3c3dfc47b23..9f27d24e277 100644 --- a/src/slurmd/job.h +++ b/src/slurmd/job.h @@ -39,6 +39,7 @@ #include "src/common/list.h" #include "src/common/eio.h" #include "src/common/switch.h" +#include "src/common/env.h" #ifndef MAXHOSTNAMELEN @@ -136,8 +137,7 @@ typedef struct slurmd_job { /* communication between slurmds */ uint16_t task_flags; - slurm_addr *cli; - slurm_addr *self; + env_t *envtp; } slurmd_job_t; diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index 7a5e7854a02..5403e2079f7 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -61,7 +61,6 @@ #include "src/common/node_select.h" #include "src/common/fd.h" #include "src/common/safeopen.h" -#include "src/common/env.h" #include "src/common/slurm_jobacct.h" #include "src/common/switch.h" #include "src/common/xsignal.h" @@ -138,8 +137,6 @@ static int _send_pending_exit_msgs(slurmd_job_t *job); static void _kill_running_tasks(slurmd_job_t *job); static void _setargs(slurmd_job_t *job); -static void _setup_spawn_env(slurmd_job_t *, - slurm_addr *cli, slurm_addr *self); static void _random_sleep(slurmd_job_t *job); static char *_sprint_task_cnt(batch_job_launch_msg_t *msg); @@ -148,8 +145,8 @@ static char *_sprint_task_cnt(batch_job_launch_msg_t *msg); */ static char * _make_batch_dir(slurmd_job_t *job); static char * _make_batch_script(batch_job_launch_msg_t *msg, char *path); -static int _setup_batch_env(slurmd_job_t *job, batch_job_launch_msg_t *msg); -static int 
_complete_job(uint32_t jobid, uint32_t stepid, int err, int status); +static int _complete_job(uint32_t jobid, uint32_t stepid, + int err, int status); /* SIGHUP (empty) signal handler @@ -164,7 +161,6 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli, slurm_addr *self) { slurmd_job_t *job = NULL; - env_t *env = xmalloc(sizeof(env_t)); if (!(job = job_create(msg, cli))) { _send_launch_failure (msg, cli, errno); @@ -174,12 +170,14 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli, _set_job_log_prefix(job); _setargs(job); - job->cli = cli; - job->self = self; - if (_job_mgr(job) < 0) + job->envtp->cli = cli; + job->envtp->self = self; + + if (_job_mgr(job) < 0) return SLURM_ERROR; - + + job_destroy(job); return SLURM_SUCCESS; @@ -194,9 +192,15 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli) int rc = 0; int status = 0; uint32_t jobid = msg->job_id; - slurmd_job_t *job; - char *batchdir; - + slurmd_job_t *job = NULL; + char *batchdir = NULL; + char buf[1024]; + hostlist_t hl = hostlist_create(msg->nodes); + if (!hl) + return SLURM_ERROR; + + hostlist_ranged_string(hl, 1024, buf); + if (!(job = job_batch_job_create(msg))) { /* * Set "job" status to returned errno and cleanup job. 
@@ -204,7 +208,7 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli) status = errno; goto cleanup; } - + _set_job_log_prefix(job); _setargs(job); @@ -216,11 +220,16 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli) if ((job->argv[0] = _make_batch_script(msg, batchdir)) == NULL) goto cleanup2; - if ((rc = _setup_batch_env(job, msg)) < 0) - goto cleanup2; - + + job->envtp->nprocs = msg->nprocs; + job->envtp->select_jobinfo = msg->select_jobinfo; + job->envtp->nhosts = hostlist_count(hl); + hostlist_destroy(hl); + job->envtp->nodelist = buf; + job->envtp->task_count = _sprint_task_cnt(msg); + status = _job_mgr(job); - + cleanup2: if (job->argv[0] && (unlink(job->argv[0]) < 0)) error("unlink(%s): %m", job->argv[0]); @@ -247,7 +256,7 @@ mgr_spawn_task(spawn_task_request_msg_t *msg, slurm_addr *cli, slurm_addr *self) { slurmd_job_t *job = NULL; - + if (!(job = job_spawn_create(msg, cli))) return SLURM_ERROR; @@ -256,12 +265,12 @@ mgr_spawn_task(spawn_task_request_msg_t *msg, slurm_addr *cli, _setargs(job); - job->cli = cli; - job->self = self; + job->envtp->cli = cli; + job->envtp->self = self; - if (_job_mgr(job) < 0) + if (_job_mgr(job) < 0) return SLURM_ERROR; - + job_destroy(job); return SLURM_SUCCESS; @@ -478,9 +487,10 @@ static int _job_mgr(slurmd_job_t *job) { int rc = 0; - + debug3("Entered job_mgr pid=%lu", (unsigned long) getpid()); + if (shm_init(false) < 0) goto fail0; @@ -497,7 +507,7 @@ _job_mgr(slurmd_job_t *job) rc = ESLURM_INTERCONNECT_FAILURE; goto fail1; } - + xsignal_block(mgr_sigarray); xsignal(SIGHUP, _hup_handler); @@ -1030,42 +1040,6 @@ _make_batch_script(batch_job_launch_msg_t *msg, char *path) } -static int -_setup_batch_env(slurmd_job_t *job, batch_job_launch_msg_t *msg) -{ - char buf[1024], *task_buf, *bgl_part_id = NULL; - struct utsname name; - hostlist_t hl = hostlist_create(msg->nodes); - env_t *env = xmalloc(sizeof(env_t)); - - if (!hl) - return SLURM_ERROR; - - hostlist_ranged_string(hl, 1024, buf); 
- - env->stepid = -1; - env->gmpi = -1; - env->procid = -1; - env->nodeid = -1; - env->nprocs = msg->nprocs; - env->select_jobinfo = msg->select_jobinfo; - env->jobid = job->jobid; - env->nhosts = hostlist_count(hl); - hostlist_destroy(hl); - env->nodelist = buf; - env->task_count = _sprint_task_cnt(msg); - env->env = job->env; - - setup_env(env); - job->env = env->env; - env->env = NULL; - xfree(env->task_count); - xfree(env); - - return 0; - -} - static char * _sprint_task_cnt(batch_job_launch_msg_t *msg) { @@ -1301,28 +1275,3 @@ _setargs(slurmd_job_t *job) return; } - -static void -_setup_spawn_env(slurmd_job_t *job, slurm_addr *cli, slurm_addr *self) -{ - env_t *env = xmalloc(sizeof(env_t)); - - env->stepid = -1; - env->gmpi = -1; - env->procid = -1; - env->nodeid = -1; - env->jobid = -1; - - env->cli = cli; - env->self = self; - env->jobid = job->jobid; - env->stepid = job->stepid; - env->env = job->env; - - setup_env(env); - job->env = env->env; - env->env = NULL; - xfree(env); - return; -} - diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c index 1f74a561eb9..2544bbadb7c 100644 --- a/src/slurmd/smgr.c +++ b/src/slurmd/smgr.c @@ -58,7 +58,6 @@ #include "src/common/fd.h" #include "src/common/log.h" #include "src/common/slurm_jobacct.h" -#include "src/common/env.h" #include "src/common/switch.h" #include "src/common/xsignal.h" @@ -293,7 +292,7 @@ _exec_all_tasks(slurmd_job_t *job) xassert(job != NULL); xassert(fd >= 0); - + /* * Block signals for this process before exec-ing * user tasks. Esp. 
important to block SIGCHLD until @@ -371,18 +370,27 @@ static void _exec_task(slurmd_job_t *job, int i) { task_info_t *t = NULL; - env_t *env = xmalloc(sizeof(env_t)); - env->stepid = -1; - env->gmpi = -1; - env->procid = -1; - env->nodeid = -1; - env->jobid = -1; - if (xsignal_unblock(smgr_sigarray) < 0) { error("unable to unblock signals"); exit(1); } + job->envtp->jobid = job->jobid; + job->envtp->stepid = job->stepid; + job->envtp->nodeid = job->nodeid; + job->envtp->cpus_on_node = job->cpus; + job->envtp->env = job->env; + + t = job->task[i]; + job->envtp->procid = t->gtid; + job->envtp->gmpi = t->gtid; + + + setup_env(job->envtp); + job->env = job->envtp->env; + job->envtp->env = NULL; + xfree(job->envtp->task_count); + if (!job->batch) { if (interconnect_attach(job->switch_job, &job->env, job->nodeid, (uint32_t) i, job->nnodes, @@ -391,23 +399,6 @@ _exec_task(slurmd_job_t *job, int i) exit(1); } - t = job->task[i]; - - env->jobid = job->jobid; - env->stepid = job->stepid; - env->nodeid = job->nodeid; - env->cpus_on_node = job->cpus; - env->cli = job->cli; - env->self = job->self; - env->procid = t->gtid; - env->gmpi = t->gtid; - env->env = job->env; - - setup_env(env); - job->env = env->env; - env->env = NULL; - xfree(env); - _pdebug_stop_current(job); } diff --git a/src/srun/srun.c b/src/srun/srun.c index 8e1c95ba5d0..d76fe141797 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -204,6 +204,7 @@ int srun(int ac, char **av) job_destroy(job,exitcode); debug ("Spawned srun shell terminated"); + xfree(env->task_count); xfree(env); exit (exitcode); @@ -237,7 +238,6 @@ int srun(int ac, char **av) /* * Enhance environment for job */ - env->nprocs = opt.nprocs; env->cpus_per_task = opt.cpus_per_task; env->distribution = opt.distribution; @@ -246,14 +246,15 @@ int srun(int ac, char **av) env->labelio = opt.labelio; if(job) { env->select_jobinfo = job->select_jobinfo; - env->jobid = job->jobid; env->nhosts = job->nhosts; env->nodelist = job->nodelist; 
env->task_count = _task_count_string (job); } setup_env(env); + xfree(env->task_count); xfree(env); + if (slurm_get_mpich_gm_dir() && getenv("GMPI_PORT") == NULL) { /* * It is possible for one to modify the mpirun command in -- GitLab