diff --git a/NEWS b/NEWS index eafff180d584888c9ebea217acf829e92dfb2fe9..8882957ddf1073d73328e8e0f824210823466408 100644 --- a/NEWS +++ b/NEWS @@ -172,6 +172,9 @@ documents those changes that are of interest to users and admins. specifying an account of the form "qos-name". -- In select/linear, fix bug in scheduling required nodes that already have a job running on them (req.load.patch from Chris Holmes, HP). + -- Change timeout for srun/sbatch --get-user-env option to 2 secs, don't get + DISPLAY environment variables, but explicitly set ENVIRONMENT=BATCH and + HOSTNAME to the execution host of the batch script. * Changes in SLURM 1.2.22 ========================= diff --git a/src/common/env.c b/src/common/env.c index eb7e84f900fdade630cf732926a7f01b61fcb335..13251064d481f7c895bb4c95dbeb24a2554fb165 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -75,7 +75,7 @@ strong_alias(env_array_append_fmt, slurm_env_array_append_fmt); strong_alias(env_array_overwrite, slurm_env_array_overwrite); strong_alias(env_array_overwrite_fmt, slurm_env_array_overwrite_fmt); -#define SU_WAIT_MSEC 8000 /* 8000 msec for /bin/su to return user +#define SU_WAIT_MSEC 2000 /* 2 sec default for /bin/su to return user * env vars for --get-user-env option */ #define ENV_BUFSIZE (256 * 1024) @@ -132,6 +132,18 @@ _extend_env(char ***envp) return (++ep); } +/* return true if the environment variables should not be set for + * srun's --get-user-env option */ +static bool _discard_env(char *name, char *value) +{ + if ((strcmp(name, "DISPLAY") == 0) || + (strcmp(name, "ENVIRONMENT") == 0) || + (strcmp(name, "HOSTNAME") == 0)) + return true; + + return false; +} + /* * Return the number of elements in the environment `env' */ @@ -807,6 +819,7 @@ env_array_for_job(char ***dest, const resource_allocation_response_msg_t *alloc) * SLURM_JOB_NODELIST * SLURM_JOB_CPUS_PER_NODE * ENVIRONMENT=BATCH + * HOSTNAME * LOADLBATCH (AIX only) * * Sets OBSOLETE variables: @@ -816,8 +829,9 @@ env_array_for_job(char ***dest, const resource_allocation_response_msg_t *alloc) * SLURM_TASKS_PER_NODE <- poorly named, really CPUs per node * ? probably only needed for users... */ -void -env_array_for_batch_job(char ***dest, const batch_job_launch_msg_t *batch) +extern void +env_array_for_batch_job(char ***dest, const batch_job_launch_msg_t *batch, + const char *node_name) { char *tmp; uint32_t num_nodes = 0; @@ -837,6 +851,8 @@ env_array_for_batch_job(char ***dest, const batch_job_launch_msg_t *batch) batch->cpu_count_reps); env_array_overwrite_fmt(dest, "SLURM_JOB_CPUS_PER_NODE", "%s", tmp); env_array_overwrite_fmt(dest, "ENVIRONMENT", "BATCH"); + if (node_name) + env_array_overwrite_fmt(dest, "HOSTNAME", "%s", node_name); #ifdef HAVE_AIX /* this puts the "poe" command into batch mode */ env_array_overwrite(dest, "LOADLBATCH", "yes"); @@ -1274,8 +1290,9 @@ char **_load_env_cache(const char *username) if (!fgets(line, sizeof(line), fp)) break; _strip_cr_nl(line); - if (_env_array_entry_splitter(line, name, sizeof(name), - value, sizeof(value))) + if (_env_array_entry_splitter(line, name, sizeof(name), + value, sizeof(value)) && + (!_discard_env(name, value))) env_array_overwrite(&env, name, value); } fclose(fp); @@ -1291,7 +1308,7 @@ char **_load_env_cache(const char *username) * 2. Load the user environment from a cache file. This is used * in the event that option 1 times out. * - * timeout value is in seconds or zero for default (8 secs) + * timeout value is in seconds or zero for default (2 secs) * mode is 1 for short ("su <user>"), 2 for long ("su - <user>") * On error, returns NULL. * @@ -1313,7 +1330,7 @@ char **env_array_user_default(const char *username, int timeout, int mode) struct pollfd ufds; if (geteuid() != (uid_t)0) { - info("WARNING: you must be root to use --get-user-env"); + fatal("WARNING: you must be root to use --get-user-env"); return NULL; } @@ -1414,7 +1431,7 @@ char **env_array_user_default(const char *username, int timeout, int mode) close(fildes[0]); if (!found) { error("Failed to load current user environment variables"); - _load_env_cache(username); + return _load_env_cache(username); } /* First look for the start token in the output */ @@ -1444,12 +1461,14 @@ char **env_array_user_default(const char *username, int timeout, int mode) break; } if (_env_array_entry_splitter(line, name, sizeof(name), - value, sizeof(value))) + value, sizeof(value)) && + (!_discard_env(name, value))) env_array_overwrite(&env, name, value); line = strtok_r(NULL, "\n", &last); } if (!found) { error("Failed to get all user environment variables"); + env_array_free(env); return _load_env_cache(username); } diff --git a/src/common/env.h b/src/common/env.h index e4fe65b968d14f4565b8a723f8c0933b650ea5a0..4e3c4acfd8b88f6d51f898a3c6dfb9e2bae7b324 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -120,6 +120,7 @@ void env_array_for_job(char ***dest, * SLURM_JOB_NODELIST * SLURM_JOB_CPUS_PER_NODE * ENVIRONMENT=BATCH + * HOSTNAME * LOADLBATCH (AIX only) * * Sets OBSOLETE variables: @@ -129,7 +130,9 @@ void env_array_for_job(char ***dest, * SLURM_TASKS_PER_NODE <- poorly named, really CPUs per node * ? probably only needed for users... */ -void env_array_for_batch_job(char ***dest, const batch_job_launch_msg_t *batch); +extern void env_array_for_batch_job(char ***dest, + const batch_job_launch_msg_t *batch, + const char* node_name); /* * Set in "dest the environment variables relevant to a SLURM job step, diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c index ac07bedab2c97337be5d166b0b1a3cf2c8124b8a..4732a6d9f575a6e5822b60b0f21dcd86630789fa 100644 --- a/src/plugins/mpi/mvapich/mvapich.c +++ b/src/plugins/mpi/mvapich/mvapich.c @@ -320,8 +320,12 @@ static int mvapich_read_n (mvapich_state_t *st, struct mvapich_info *mvi, return (-1); } - if (n == 0) /* unexpected EOF */ + if (n == 0) { /* unexpected EOF */ + error ("mvapich: rank %d: " + "Unexpected EOF (%dB left to read)", + mvi->rank, nleft); return (-1); + } nleft -= n; p += n; @@ -554,52 +558,55 @@ static void mvapich_bcast_hostids (mvapich_state_t *st) } /* Write size bytes from buf into socket for rank */ -static void mvapich_send (mvapich_state_t *st, void* buf, int size, int rank) +static int mvapich_send (mvapich_state_t *st, void* buf, int size, int rank) { struct mvapich_info *mvi = st->mvarray [rank]; - if (mvapich_write_n (st, mvi, buf, size) < 0) - error ("mvapich: write hostid rank %d: %m", mvi->rank); + return (mvapich_write_n (st, mvi, buf, size)); } /* Read size bytes from socket for rank into buf */ -static void mvapich_recv (mvapich_state_t *st, void* buf, int size, int rank) +static int mvapich_recv (mvapich_state_t *st, void* buf, int size, int rank) { struct mvapich_info *mvi = st->mvarray [rank]; - if (mvapich_read_n (st, mvi, buf, size) <= 0) - error("mvapich reading from %d: %m", mvi->rank); -} - -/* Read an integer from socket for rank */ -static int mvapich_recv_int (mvapich_state_t *st, int rank) -{ - int buf; - mvapich_recv(st, &buf, sizeof(buf), rank); - return buf; + return (mvapich_read_n (st, mvi, buf, size)); } /* Scatter data in buf to ranks using chunks of size bytes */ -static void mvapich_scatterbcast (mvapich_state_t *st, void* buf, int size) +static int mvapich_scatterbcast (mvapich_state_t *st, void* buf, int size) { - int i; - for (i = 0; i < st->nprocs; i++) - mvapich_send(st, buf + i*size, size, i); + int i, rc; + int n = 0; + + for (i = 0; i < st->nprocs; i++) { + if ((rc = mvapich_send (st, buf + i*size, size, i)) <= 0) + return (-1); + n += rc; + } + return (n); } /* Broadcast buf to each rank, which is size bytes big */ -static void mvapich_allgatherbcast (mvapich_state_t *st, void* buf, int size) +static int mvapich_allgatherbcast (mvapich_state_t *st, void* buf, int size) { - int i; - for (i = 0; i < st->nprocs; i++) - mvapich_send(st, buf, size, i); + int i, rc; + int n = 0; + + for (i = 0; i < st->nprocs; i++) { + if ((rc = mvapich_send (st, buf, size, i)) <= 0) + return (-1); + n += rc; + } + return (n); } /* Perform alltoall using data in buf with elements of size bytes */ -static void mvapich_alltoallbcast (mvapich_state_t *st, void* buf, int size) +static int mvapich_alltoallbcast (mvapich_state_t *st, void* buf, int size) { int pbufsize = size * st->nprocs; void* pbuf = xmalloc(pbufsize); + int i, src, rc; + int n = 0; - int i, src; for (i = 0; i < st->nprocs; i++) { for (src = 0; src < st->nprocs; src++) { memcpy( pbuf + size*src, @@ -607,22 +614,141 @@ static void mvapich_alltoallbcast (mvapich_state_t *st, void* buf, int size) size ); } - mvapich_send(st, pbuf, pbufsize, i); + if ((rc = mvapich_send (st, pbuf, pbufsize, i)) <= 0) + goto out; + n += rc; } + out: xfree(pbuf); + return (rc < 0 ? rc : n); +} + +static int recv_common_value (mvapich_state_t *st, int *valp, int rank) +{ + int val; + if (mvapich_recv (st, &val, sizeof (int), rank) <= 0) { + error ("mvapich: recv: rank %d: %m\n", rank); + return (-1); + } + + /* + * If value is uninitialized, set it to current value, + * otherwise ensure that current value matches previous + */ + if (*valp == -1) + *valp = val; + else if (val != *valp) { + error ("mvapich: PMGR: unexpected value from rank %d: " + "expected %d, recvd %d", rank, *valp, val); + return (-1); + } + return (0); +} + +/* + * PMGR_BCAST (root, size of message, then message data (from root only)) + */ +static int process_pmgr_bcast (mvapich_state_t *st, int *rootp, int *sizep, + void ** bufp, int rank) +{ + if (recv_common_value (st, rootp, rank) < 0) + return (-1); + if (recv_common_value (st, sizep, rank) < 0) + return (-1); + if (rank != *rootp) + return (0); + + /* + * Recv data from root + */ + *bufp = xmalloc (*sizep); + if (mvapich_recv (st, *bufp, *sizep, rank) < 0) { + error ("mvapich: PMGR_BCAST: Failed to recv from root: %m"); + return (-1); + } + return (0); +} + +/* + * PMGR_GATHER (root, size of message, then message data) + */ +static int process_pmgr_gather (mvapich_state_t *st, int *rootp, + int *sizep, void **bufp, int rank) +{ + if (recv_common_value (st, rootp, rank) < 0) + return (-1); + if (recv_common_value (st, sizep, rank) < 0) + return (-1); + if (*bufp == NULL) + *bufp = xmalloc (*sizep * st->nprocs); + + if (mvapich_recv(st, (*bufp) + (*sizep)*rank, *sizep, rank) < 0) { + error ("mvapich: PMGR_/GATHER: rank %d: recv: %m", rank); + return (-1); + } + return (0); } -/* Check that new == curr value if curr has been initialized */ -static int set_current (int curr, int new) +/* + * PMGR_SCATTER (root, size of message, then message data) + */ +static int process_pmgr_scatter (mvapich_state_t *st, int *rootp, + int *sizep, void **bufp, int rank) { - if (curr == -1) - curr = new; - if (new != curr) { - error("PMGR unexpected value: received %d, expecting %d", - new, curr); + if (recv_common_value (st, rootp, rank) < 0) + return (-1); + if (recv_common_value (st, sizep, rank) < 0) + return (-1); + if (rank != *rootp) + return (0); + + if (*bufp == NULL) + *bufp = xmalloc (*sizep * st->nprocs); + + if (mvapich_recv(st, *bufp, (*sizep) * st->nprocs, rank) < 0) { + error ("mvapich: PMGR_SCATTER: rank %d: recv: %m", rank); + return (-1); } - return curr; + return (0); +} + +/* + * PMGR_ALLGATHER (size of message, then message data) + */ +static int process_pmgr_allgather (mvapich_state_t *st, int *sizep, + void **bufp, int rank) +{ + if (recv_common_value (st, sizep, rank) < 0) + return (-1); + if (*bufp == NULL) + *bufp = xmalloc (*sizep * st->nprocs); + if (mvapich_recv (st, (*bufp) + *sizep*rank, *sizep, rank) < 0) { + error ("mvapich: PMGR_ALLGATHER: rank %d: %m", rank); + return (-1); + } + return (0); +} + +/* + * PMGR_ALLTOALL (size of message, then message data) + */ +static int process_pmgr_alltoall (mvapich_state_t *st, int *sizep, + void **bufp, int rank) +{ + if (recv_common_value (st, sizep, rank) < 0) + return (-1); + + if (*bufp == NULL) + *bufp = xmalloc (*sizep * st->nprocs * st->nprocs); + if (mvapich_recv ( st, + *bufp + (*sizep * st->nprocs)*rank, + *sizep * st->nprocs, rank ) < 0) { + error ("mvapich: PMGR_ALLTOALL: recv: rank %d: %m", rank); + return (-1); + } + + return (0); } /* @@ -643,7 +769,7 @@ static int set_current (int curr, int new) * Note: Although there are op codes available for PMGR_OPEN and * PMGR_ABORT, neither is fully implemented and should not be used. */ -static void mvapich_processops (mvapich_state_t *st) +static int mvapich_processops (mvapich_state_t *st) { /* Until a 'CLOSE' or 'ABORT' message is seen, we continuously * loop processing ops @@ -663,57 +789,57 @@ static void mvapich_processops (mvapich_state_t *st) struct mvapich_info *mvi = st->mvarray [i]; // read in opcode - opcode = set_current(opcode, mvapich_recv_int(st, i)); + if (recv_common_value (st, &opcode, i) < 0) { + error ("mvapich: rank %d: Failed to read opcode: %m", + mvi->rank); + return (-1); + } // read in additional data depending on current opcode int rank, code; switch(opcode) { case 0: // PMGR_OPEN (followed by rank) - rank = mvapich_recv_int(st, i); + if (mvapich_recv (st, &rank, sizeof (int), i) <= 0) { + error ("mvapich: PMGR_OPEN: recv: %m"); + exit = 1; + } break; case 1: // PMGR_CLOSE (no data, close the socket) close(mvi->fd); break; case 2: // PMGR_ABORT (followed by exit code) - code = mvapich_recv_int(st, i); + if (mvapich_recv (st, &code, sizeof (int), i) <= 0) { + error ("mvapich: PMGR_ABORT: recv: %m"); + } error("mvapich abort with code %d from rank %d", code, i); break; case 3: // PMGR_BARRIER (no data) break; - case 4: // PMGR_BCAST (root, size of message, - // then message data (from root only)) - root = set_current(root, mvapich_recv_int(st, i)); - size = set_current(size, mvapich_recv_int(st, i)); - if (!buf) buf = (void*) xmalloc(size); - if (i == root) mvapich_recv(st, buf, size, i); + case 4: // PMGR_BCAST + if (process_pmgr_bcast (st, &root, &size, &buf, i) < 0) + return (-1); break; - case 5: // PMGR_GATHER (root, size of message, - // then message data) - root = set_current(root, mvapich_recv_int(st, i)); - size = set_current(size, mvapich_recv_int(st, i)); - if (!buf) buf = (void*) xmalloc(size * st->nprocs); - mvapich_recv(st, buf + size*i, size, i); + case 5: // PMGR_GATHER + if (process_pmgr_gather (st, &root, &size, &buf, i) < 0) + return (-1); break; - case 6: // PMGR_SCATTER (root, size of message, - // then message data) - root = set_current(root, mvapich_recv_int(st, i)); - size = set_current(size, mvapich_recv_int(st, i)); - if (!buf) buf = (void*) xmalloc(size * st->nprocs); - if (i == root) mvapich_recv(st, buf, size * st->nprocs, i); + case 6: // PMGR_SCATTER + if (process_pmgr_scatter (st, &root, + &size, &buf, i) < 0) + return (-1); break; - case 7: // PMGR_ALLGATHER (size of message, then message data) - size = set_current(size, mvapich_recv_int(st, i)); - if (!buf) buf = (void*) xmalloc(size * st->nprocs); - mvapich_recv(st, buf + size*i, size, i); + case 7: // PMGR_ALLGATHER + if (process_pmgr_allgather (st, &size, &buf, i) < 0) + return (-1); break; - case 8: // PMGR_ALLTOALL (size of message, then message data) - size = set_current(size, mvapich_recv_int(st, i)); - if (!buf) buf = (void*) xmalloc(size * st->nprocs * st->nprocs); - mvapich_recv(st, buf + (size*st->nprocs)*i, size * st->nprocs, i); + case 8: // PMGR_ALLTOALL + if (process_pmgr_alltoall (st, &size, &buf, i) < 0) + return (-1); break; default: error("Unrecognized PMGR opcode: %d", opcode); + return (-1); } } @@ -767,6 +893,7 @@ static void mvapich_processops (mvapich_state_t *st) xfree(buf); } // while(!exit) mvapich_debug ("Completed processing PMGR opcodes"); + return (0); } static void mvapich_bcast (mvapich_state_t *st) @@ -1125,6 +1252,7 @@ static void *mvapich_thr(void *arg) int first = 1; debug ("mvapich-0.9.x/gen2: thread started: %ld", pthread_self ()); + info ("mvapich debug version"); mvapich_mvarray_create (st); @@ -1158,7 +1286,8 @@ again: } if (st->protocol_version == 8) { - mvapich_processops(st); + if (mvapich_processops(st) < 0) + goto fail; } else { mvapich_debug ("bcasting mvapich info to %d tasks", st->nprocs); mvapich_bcast (st); diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index 011caf8633b389904f4673bbdf0cc2ae84e489cb..c1e103911f155f2a9b6c7bbec857578ba608cdae 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -261,11 +261,12 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc) struct passwd *pw = NULL; pw = getpwuid(opt.uid); if (pw != NULL) { - desc->environment = env_array_user_default(pw->pw_name, + desc->environment = env_array_user_default( + pw->pw_name, opt.get_user_env_time, opt.get_user_env_mode); - /* FIXME - should we abort if j->environment - * is NULL? */ + if (desc->environment == NULL) + exit(1); /* error already logged */ } } env_array_merge(&desc->environment, (const char **)environ); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 12f3024e93a04461a35866e1974d21b596961ea7..dcc4a20582465d8a2535b83ed6aadd5177d1bdaf 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -280,7 +280,7 @@ mgr_launch_batch_job_setup(batch_job_launch_msg_t *msg, slurm_addr *cli) } /* this is the new way of setting environment variables */ - env_array_for_batch_job(&job->env, msg); + env_array_for_batch_job(&job->env, msg, conf->node_name); /* this is the old way of setting environment variables */ job->envtp->nprocs = msg->nprocs;