mpi/pmix - prepare temp directory for application.

Reviewed revision of the patch. Relative to the submitted version:
 * pmixp_mkdir() reports the directory it was asked to create ("path")
   instead of pmixp_info_tmpdir_lib(), names chmod() in the chmod() failure
   message and returns the errno value captured before logging.
 * pmixp_libpmix_init() returns the code obtained from pmixp_mkdir().
 * _env_set() falls back to PMIXP_TMPDIR_DEFAULT when no base directory is
   available, as _set_tmpdirs() did before this patch.
Line breaks and indentation were restored by hand; the "index" lines still
carry the blob ids of the submitted version.

diff --git a/NEWS b/NEWS
index 878d3fc549accc6def458f6532bd6c2c0e2a3761..2aa7ac63a99bd30fbc4bb444cc950dacdee35887 100644
--- a/NEWS
+++ b/NEWS
@@ -177,6 +177,7 @@ documents those changes that are of interest to users and administrators.
  -- Cray - add NHC_ABSOLUTELY_NO to never run NHC, even on certain edge cases
     that it would otherwise be run on with NHC_NO.
  -- Ignore GRES/QOS updates that maintain the same value as before.
+ -- mpi/pmix - prepare temp directory for application.
 
 * Changes in Slurm 16.05.4
 ==========================
diff --git a/src/plugins/mpi/pmix/pmixp_client.c b/src/plugins/mpi/pmix/pmixp_client.c
index d2bbd11e6f29bdd508f3b70399ac12cd988b571f..d78772d88baff260cef107386e6ef0bd91cdb493 100644
--- a/src/plugins/mpi/pmix/pmixp_client.c
+++ b/src/plugins/mpi/pmix/pmixp_client.c
@@ -59,6 +59,11 @@
 #pragma message "PMIx version mismatch: the major version seen during configuration was " VALUE(HAVE_PMIX_VER) "L but found " VALUE(PMIX_VERSION_MAJOR) " compilation will most likely fail. Please reconfigure against the new version."
 #endif
 
+/* define some additional keys */
+#ifndef PMIX_TDIR_RMCLEAN
+#define PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean"
+#endif
+
 #define PMIXP_ALLOC_KEY(kvp, key_str)				\
 {								\
 	char *key = key_str;					\
@@ -217,31 +222,18 @@ int pmixp_libpmix_init(void)
 	mode_t rights = (S_IRUSR | S_IWUSR | S_IXUSR) |
 			(S_IRGRP | S_IWGRP | S_IXGRP);
 	pmix_info_t *kvp = NULL;
-	/* NOTE: we need user who owns the job to access PMIx usock
-	 * file. According to 'man 7 unix':
-	 * "... In the Linux implementation, sockets which are visible in the file system
-	 * honor the permissions of the directory they are in... "
-	 * Our case is the following: slurmstepd is usually running as root, user application will
-	 * be "sudo'ed". To provide both of them with acces to the unix socket we do the following:
-	 * 1. Owner ID is set to the job owner.
-	 * 2. Group ID corresponds to slurmstepd.
-	 * 3. Set 0770 access mode */
-	if (0 != mkdir(pmixp_info_tmpdir_lib(), rights) ) {
-		PMIXP_ERROR_STD("Cannot create directory \"%s\"", pmixp_info_tmpdir_lib());
-		return errno;
+	if (0 != (rc = pmixp_mkdir(pmixp_info_tmpdir_lib(), rights))) {
+		PMIXP_ERROR_STD("Cannot create server tmpdir: \"%s\"",
+				pmixp_info_tmpdir_lib());
+		return rc;
 	}
 
-	/* There might be umask that will drop essential rights. Fix it explicitly.
-	 * TODO: is there more elegant solution? */
-	if (chmod(pmixp_info_tmpdir_lib(), rights) < 0) {
-		error("chown(%s): %m", pmixp_info_tmpdir_lib());
-		return errno;
+	if (0 != (rc = pmixp_mkdir(pmixp_info_tmpdir_cli(), rights))) {
+		PMIXP_ERROR_STD("Cannot create client tmpdir: \"%s\"",
+				pmixp_info_tmpdir_cli());
+		return rc;
 	}
 
-	if (chown(pmixp_info_tmpdir_lib(), (uid_t) pmixp_info_jobuid(), (gid_t) -1) < 0) {
-		error("chown(%s): %m", pmixp_info_tmpdir_lib());
-		return errno;
-	}
 	/* TODO: must be deleted in future once info-key approach will harden */
 	setenv(PMIXP_PMIXLIB_TMPDIR, pmixp_info_tmpdir_lib(), 1);
 
@@ -296,10 +288,17 @@ int pmixp_libpmix_finalize(void)
 	}
 
 	rc1 = pmixp_rmdir_recursively(pmixp_info_tmpdir_lib());
-	if (0 == rc) {
-		/* return only one error :) */
-		rc = rc1;
+	if (0 != rc1) {
+		PMIXP_ERROR_STD("Failed to remove %s\n", pmixp_info_tmpdir_lib());
+		/* Not considering this as fatal error */
 	}
+
+	rc1 = pmixp_rmdir_recursively(pmixp_info_tmpdir_cli());
+	if (0 != rc1) {
+		PMIXP_ERROR_STD("Failed to remove %s\n", pmixp_info_tmpdir_cli());
+		/* Not considering this as fatal error */
+	}
+
 	return rc;
 }
 
@@ -362,27 +361,29 @@ static void _set_tmpdirs(List lresp)
 {
 	pmix_info_t *kvp;
 	char *p = NULL;
+	bool rmclean = true;
 
 	/* We consider two sources of the tempdir:
 	 * - SLURM's slurm.conf TmpFS option;
 	 * - env var SLURM_PMIX_TMPDIR;
 	 * do we need to do anything else?
 	 */
-	p = pmixp_info_tmpdir_cli();
-	if (NULL == p) {
-		p = PMIXP_TMPDIR_DEFAULT;
-	}
+	p = pmixp_info_tmpdir_cli_base();
 	PMIXP_ALLOC_KEY(kvp, PMIX_TMPDIR);
 	PMIX_VAL_SET(&kvp->value, string, p);
 	list_append(lresp, kvp);
+
+	p = pmixp_info_tmpdir_cli();
 	PMIXP_ALLOC_KEY(kvp, PMIX_NSDIR);
 	PMIX_VAL_SET(&kvp->value, string, p);
 	list_append(lresp, kvp);
-	PMIXP_ALLOC_KEY(kvp, PMIX_PROCDIR);
-	PMIX_VAL_SET(&kvp->value, string, p);
+
+	PMIXP_ALLOC_KEY(kvp, PMIX_TDIR_RMCLEAN);
+	PMIX_VAL_SET(&kvp->value, flag, rmclean);
 	list_append(lresp, kvp);
 }
 
+
 /*
  * information about relative ranks as assigned by the RM
  */
diff --git a/src/plugins/mpi/pmix/pmixp_info.c b/src/plugins/mpi/pmix/pmixp_info.c
index d2261c5d41e7313166819670180e5fb7f144e3fd..ed81578ef991dacd1f1667b56c756b24352a6326 100644
--- a/src/plugins/mpi/pmix/pmixp_info.c
+++ b/src/plugins/mpi/pmix/pmixp_info.c
@@ -299,13 +299,24 @@ static int _env_set(char ***env)
 	 */
 	p = getenvp(*env, PMIXP_TMPDIR_CLI);
 	if (NULL != p) {
-		_pmixp_job_info.cli_tmpdir = xstrdup(p);
+		_pmixp_job_info.cli_tmpdir_base = xstrdup(p);
 	} else {
 		p = slurm_get_tmp_fs();
 		if (NULL != p) {
-			_pmixp_job_info.cli_tmpdir = p;
+			_pmixp_job_info.cli_tmpdir_base = p;
 		}
 	}
+	if (NULL == _pmixp_job_info.cli_tmpdir_base) {
+		/* Neither source gave a location: keep the default that
+		 * _set_tmpdirs() used to apply, "%s" must not get NULL */
+		_pmixp_job_info.cli_tmpdir_base =
+				xstrdup(PMIXP_TMPDIR_DEFAULT);
+	}
+	_pmixp_job_info.cli_tmpdir =
+			xstrdup_printf("%s/spmix_appdir_%d.%d",
+				       _pmixp_job_info.cli_tmpdir_base,
+				       pmixp_info_jobid(), pmixp_info_stepid());
+
 
 	/* ----------- Timeout setting ------------- */
 	/* TODO: also would be nice to have a cluster-wide setting in SLURM */
diff --git a/src/plugins/mpi/pmix/pmixp_info.h b/src/plugins/mpi/pmix/pmixp_info.h
index b51fec493823dcd0d51325d323ed147de48c5d84..5fe7620adda6b4b07a88fedfe620322ce1ec9441 100644
--- a/src/plugins/mpi/pmix/pmixp_info.h
+++ b/src/plugins/mpi/pmix/pmixp_info.h
@@ -67,7 +67,7 @@ typedef struct {
 	uint32_t *gtids; /* global ids of tasks located on *this* node */
 	char *task_map_packed; /* string represents packed task mapping information */
 	int timeout;
-	char *cli_tmpdir;
+	char *cli_tmpdir, *cli_tmpdir_base;
 	char *lib_tmpdir;
 	uid_t uid;
 	gid_t gid;
@@ -98,6 +98,12 @@ static inline char *pmixp_info_tmpdir_cli(void)
 	return _pmixp_job_info.cli_tmpdir;
 }
 
+/* Base directory in which the per-step client tempdir is created */
+static inline char *pmixp_info_tmpdir_cli_base(void)
+{
+	return _pmixp_job_info.cli_tmpdir_base;
+}
+
 /* Cli tempdir */
 static inline char *pmixp_info_tmpdir_lib(void)
 {
diff --git a/src/plugins/mpi/pmix/pmixp_utils.c b/src/plugins/mpi/pmix/pmixp_utils.c
index fb90534b30ddf11f9046e2a00b0e70280af6d1ea..bd76ef379054145742d98114efb123852850a0d1 100644
--- a/src/plugins/mpi/pmix/pmixp_utils.c
+++ b/src/plugins/mpi/pmix/pmixp_utils.c
@@ -417,3 +417,49 @@ int pmixp_fixrights(char *path, uid_t uid, mode_t mode)
 	closedir(dp);
 	return 0;
 }
+
+/*
+ * Create directory "path", force its access mode to "rights" and make the
+ * job's user its owner (the group is left unchanged).
+ *
+ * Returns 0 on success, otherwise the errno value of the call that failed.
+ */
+int pmixp_mkdir(char *path, mode_t rights)
+{
+	int rc;
+
+	/* NOTE: we need user who owns the job to access PMIx usock
+	 * file. According to 'man 7 unix':
+	 * "... In the Linux implementation, sockets which are visible in the
+	 * file system honor the permissions of the directory they are in... "
+	 * Our case is the following: slurmstepd is usually running as root,
+	 * user application will be "sudo'ed". To provide both of them with
+	 * access to the unix socket we do the following:
+	 * 1. Owner ID is set to the job owner.
+	 * 2. Group ID corresponds to slurmstepd.
+	 * 3. Set 0770 access mode
+	 */
+
+	if (0 != mkdir(path, rights)) {
+		/* save errno: the logging below may overwrite it */
+		rc = errno;
+		PMIXP_ERROR_STD("Cannot create directory \"%s\"", path);
+		return rc;
+	}
+
+	/* There might be umask that will drop essential rights.
+	 * Fix it explicitly.
+	 * TODO: is there more elegant solution? */
+	if (chmod(path, rights) < 0) {
+		rc = errno;
+		error("%s: chmod(%s): %m", __func__, path);
+		return rc;
+	}
+
+	if (chown(path, (uid_t) pmixp_info_jobuid(), (gid_t) -1) < 0) {
+		rc = errno;
+		error("%s: chown(%s): %m", __func__, path);
+		return rc;
+	}
+	return 0;
+}
diff --git a/src/plugins/mpi/pmix/pmixp_utils.h b/src/plugins/mpi/pmix/pmixp_utils.h
index a8d17b8be74dc65b617e490c2ff15a41f066ef62..1eb78142b7beff6cac9974dd002a06d599a31bd4 100644
--- a/src/plugins/mpi/pmix/pmixp_utils.h
+++ b/src/plugins/mpi/pmix/pmixp_utils.h
@@ -55,5 +55,6 @@ int pmixp_stepd_send(char *nodelist, const char *address, char *data,
 		     int silent);
 int pmixp_rmdir_recursively(char *path);
 int pmixp_fixrights(char *path, uid_t uid, mode_t mode);
+int pmixp_mkdir(char *path, mode_t rights);
 
 #endif /* PMIXP_UTILS_H*/