diff --git a/NEWS b/NEWS index 177f780d36d60d7935609b9cf4d359e6f9bde11b..a1e8585776e482f4ac1c6edc43ec788b341ff0f0 100644 --- a/NEWS +++ b/NEWS @@ -182,6 +182,8 @@ documents those changes that are of interest to users and admins. -- Bug fix for setting exit code in accounting for batch script. -- Add salloc option, --no-shell (for LSF). -- Added new options for sacct output + -- mvapich: Ensure MPIRUN_ID is unique for all job steps within a job. + (Fixes crashes when running multiple job steps within a job on one node) * Changes in SLURM 1.2.24 ========================= diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c index 38f1f5d8ecab0bd3c67861158089aa08f123ca6e..b2e5b868484b3c3867e4e29a0bcb7ef27a39ae68 100644 --- a/src/plugins/mpi/mvapich/mvapich.c +++ b/src/plugins/mpi/mvapich/mvapich.c @@ -1366,6 +1366,15 @@ static void mvapich_state_destroy(mvapich_state_t *st) xfree(st); } +/* + * Build the MPIRUN_ID for a job step so each jobid/stepid pair gets its own id: (jobid << 16) OR-ed with the low 16 bits of stepid, returned as int. + * NOTE(review): ids presumably stay distinct only while stepid < 65536 and jobid loses no bits in the 16-bit shift; confirm against the field types. + */ +int mpirun_id_create(const mpi_plugin_client_info_t *job) +{ + return (int) ((job->jobid << 16) | (job->stepid & 0xffff)); +} + extern mvapich_state_t *mvapich_thr_create(const mpi_plugin_client_info_t *job, char ***env) { @@ -1405,12 +1414,12 @@ extern mvapich_state_t *mvapich_thr_create(const mpi_plugin_client_info_t *job, */ env_array_overwrite_fmt(env, "MPIRUN_PORT", "%hu", port); env_array_overwrite_fmt(env, "MPIRUN_NPROCS", "%d", st->nprocs); - env_array_overwrite_fmt(env, "MPIRUN_ID", "%d", st->job->jobid); + env_array_overwrite_fmt(env, "MPIRUN_ID", "%d", mpirun_id_create(job)); if (st->connect_once) { env_array_overwrite_fmt(env, "MPIRUN_CONNECT_ONCE", "1"); } - verbose ("mvapich-0.9.[45] master listening on port %d", port); + verbose ("mvapich-0.9.[45] master listening on port %hu", port); return st; }