diff --git a/doc/man/man5/cgroup.conf.5 b/doc/man/man5/cgroup.conf.5 index 92e96dc04be28dba7493a1848bcad7b5cad61a0a..5b647cfc7f98183741555b4026529783f45e9e45 100644 --- a/doc/man/man5/cgroup.conf.5 +++ b/doc/man/man5/cgroup.conf.5 @@ -25,6 +25,12 @@ one is a proctrack plugin, the second one a task plugin. The following cgroup.conf parameters are defined to control the general behavior of Slurm cgroup plugins. +.TP +\fBCgroupMountpoint\fR=\fIPATH\fR +Specify the \fIPATH\fR under which cgroups should be mounted. This +should be a writeable directory which will contain cgroups mounted +one per subsystem. The default \fIPATH\fR is /cgroup. + .TP \fBCgroupAutomount\fR=<yes|no> Slurm cgroup plugins require valid and functional cgroup subsystem to be mounted @@ -105,17 +111,19 @@ would be added : .TP \fBAllowedRAMSpace\fR=<number> Constrain the job cgroup RAM to this percentage of the allocated memory. -The default value is 100. -If the limit is exceeded, the job steps will be killed and a warning message -will be written to standard error. -Also see \fBConstrainRAMSpace\fR. +The default value is 100. If SLURM is not allocating memory to jobs, the +\fBMaxRAMPercent\fR limit applies instead. The percentage supplied may be +expressed as a floating point number, e.g. 98.5. If the \fBAllowedRAMSpace\fR +limit is exceeded, the job steps will be killed and a warning message will +be written to standard error. Also see \fBConstrainRAMSpace\fR. .TP \fBAllowedSwapSpace\fR=<number> -Constrain the job cgroup swap space to this percentage of the allocated memory. -The default value is 0. -If the limit is exceeded, the job steps will be killed and a warning message -will be written to standard error. +Constrain the job cgroup swap space to this percentage of the allocated +memory. The default value is 0, which means that RAM+Swap will be limited +to \fBAllowedRAMSpace\fR. The supplied percentage may be expressed as a +floating point number, e.g. 50.5. If the limit is exceeded, the job steps +will be killed and a warning message will be written to standard error. Also see \fBConstrainSwapSpace\fR. .TP @@ -130,6 +138,28 @@ If configured to "yes" then constrain the job's swap space usage. The default value is "no". Also see \fBAllowedSwapSpace\fR. +.TP +\fBMaxRAMPercent\fR=\fIPERCENT\fR +Set an upper bound in percent of total RAM on the RAM constraint for a job. +This will be the memory constraint applied to jobs that are not explicitly +allocated memory by SLURM. The \fIPERCENT\fR may be an arbitrary floating +point number. The default value is 100. + +.TP +\fBMaxSwapPercent\fR=\fIPERCENT\fR +Set an upper bound (in percent of total RAM) on the amount of RAM+Swap +that may be used for a job. This will be the swap limit applied to jobs +on systems where memory is not being explicitly allocated to jobs. The +\fIPERCENT\fR may be an arbitrary floating point number between 0 and 100. +The default value is 100. + +.TP +\fBMinRAMSpace\fR=<number> +Set a lower bound (in MB) on the memory limits defined by +\fBAllowedRAMSpace\fR and \fBAllowedSwapSpace\fR. This prevents +accidentally creating a memory cgroup with such a low limit that slurmstepd +is immediately killed due to lack of RAM. The default limit is 30M. 
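For illustration only (not part of the patch), here is a cgroup.conf fragment exercising the memory parameters documented above; every value is an arbitrary example rather than a recommended or default setting.

    ###
    # Example cgroup.conf (illustrative values only)
    ###
    CgroupAutomount=yes
    CgroupMountpoint=/cgroup
    ConstrainRAMSpace=yes
    AllowedRAMSpace=98.5
    MaxRAMPercent=95.0
    MinRAMSpace=30
    ConstrainSwapSpace=yes
    AllowedSwapSpace=50.5
    MaxSwapPercent=90.0

With these settings a job that was allocated memory by SLURM is limited to 98.5% of that allocation (never below 30 MB), while a job with no explicit memory allocation falls back to the MaxRAMPercent and MaxSwapPercent bounds, expressed as percentages of the node's total RAM.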
+ .TP \fBConstrainDevices\fR=<yes|no> If configured to "yes" then constrain the job's allowed devices based on GRES diff --git a/etc/cgroup.release_common.example b/etc/cgroup.release_common.example index 6c6c4d03a25cbd0c31f11910f705db2068eac7c0..f431d26855b51c00accca33218f9656829af02df 100644 --- a/etc/cgroup.release_common.example +++ b/etc/cgroup.release_common.example @@ -13,10 +13,20 @@ # to ensure coherency of the cgroups contents. # -base_path=/cgroup progname=$(basename $0) subsystem=${progname##*_} -orphancg=${base_path}/${subsystem}/orphan + +get_mount_dir() +{ + local lssubsys=$(type -p lssubsys) + if [ -x $lssubsys ]; then + $lssubsys -m $subsystem | awk '{print $2}' + else + awk "/release_agent=$0/ { print \$2 }" + fi +} + +mountdir=$(get_mount_dir) if [[ $# -eq 0 ]] then @@ -24,14 +34,31 @@ then exit 1 fi +# build orphan cg path +if [[ $# -eq 1 ]] +then + rmcg=${mountdir}$1 +else + rmcg=${mountdir}$2 +fi +slurmcg=${rmcg%/uid_*} +if [[ ${slurmcg} == ${rmcg} ]] +then + # not a slurm job pattern, perhaps the slurmcg, just remove + # the dir with a lock and exit + flock -x ${mountdir} -c "rmdir ${rmcg}" + exit $? +fi +orphancg=${slurmcg}/orphan + # make sure orphan cgroup is existing if [[ ! -d ${orphancg} ]] then mkdir ${orphancg} case ${subsystem} in cpuset) - cat ${base_path}/${subsystem}/cpuset.cpus > ${orphancg}/cpuset.cpus - cat ${base_path}/${subsystem}/cpuset.mems > ${orphancg}/cpuset.mems + cat ${mountdir}/cpuset.cpus > ${orphancg}/cpuset.cpus + cat ${mountdir}/cpuset.mems > ${orphancg}/cpuset.mems ;; *) ;; @@ -42,7 +69,7 @@ fi if [[ $# -eq 1 ]] then - rmcg=${base_path}/${subsystem}$@ + rmcg=${mountdir}$@ # try to extract the uid cgroup from the input one # ( extract /uid_% from /uid%/job_*...) @@ -51,13 +78,13 @@ then then # not a slurm job pattern, perhaps the uidcg, just remove # the dir with a lock and exit - flock -x ${base_path}/${subsystem} -c "rmdir ${rmcg}" + flock -x ${mountdir} -c "rmdir ${rmcg}" exit $? fi - if [[ -d ${base_path}/${subsystem} ]] + if [[ -d ${mountdir} ]] then - flock -x ${base_path}/${subsystem} -c "$0 sync $@" + flock -x ${mountdir} -c "$0 sync $@" fi exit $? 
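A hedged walk-through of the rewritten release script above, using hypothetical paths (the freezer hierarchy mounted at /cgroup/freezer, and an emptied step cgroup /slurm/uid_1000/job_42/step_0 passed by the kernel as the first argument):

    # release_freezer invoked as: release_freezer /slurm/uid_1000/job_42/step_0
    #
    #   mountdir=$(get_mount_dir)    # -> /cgroup/freezer
    #   rmcg=${mountdir}$1           # -> /cgroup/freezer/slurm/uid_1000/job_42/step_0
    #   slurmcg=${rmcg%/uid_*}       # -> /cgroup/freezer/slurm
    #   orphancg=${slurmcg}/orphan   # -> /cgroup/freezer/slurm/orphan
    #
    # Since slurmcg differs from rmcg, the path matches the slurm job pattern:
    # the orphan cgroup is created if missing (seeded with cpuset.cpus and
    # cpuset.mems for the cpuset subsystem) and the script re-invokes itself
    # as "$0 sync ..." under "flock -x ${mountdir}" so that concurrent
    # removals of sibling cgroups are serialized.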
@@ -68,7 +95,7 @@ elif [[ $# -eq 2 ]] && [[ $1 == "sync" ]] then shift - rmcg=${base_path}/${subsystem}$@ + rmcg=${mountdir}$@ uidcg=${rmcg%/job_*} # remove this cgroup diff --git a/slurm.spec b/slurm.spec index fe85d4cb6c398fa86b25e6cc1a7be6a250c13204..1333c864fd305a2840271838ef7520e6f9f8ac31 100644 --- a/slurm.spec +++ b/slurm.spec @@ -589,10 +589,10 @@ rm -rf $RPM_BUILD_ROOT %config %{_sysconfdir}/slurm.conf.example %config %{_sysconfdir}/cgroup.conf.example %config %{_sysconfdir}/cgroup_allowed_devices_file.conf.example -%config %{_sysconfdir}/cgroup.release_common.example -%config (noreplace) %{_sysconfdir}/cgroup/release_freezer -%config (noreplace) %{_sysconfdir}/cgroup/release_cpuset -%config (noreplace) %{_sysconfdir}/cgroup/release_memory +%config (replace) %{_sysconfdir}/cgroup.release_common.example +%config (replace) %{_sysconfdir}/cgroup/release_freezer +%config (replace) %{_sysconfdir}/cgroup/release_cpuset +%config (replace) %{_sysconfdir}/cgroup/release_memory %config %{_sysconfdir}/slurm.epilog.clean %exclude %{_mandir}/man1/sjobexit* %if %{slurm_with blcr} diff --git a/src/common/xcgroup.c b/src/common/xcgroup.c index 8cc8b6376d2ddd9a364f3e23b5469314c86549c1..3da09bda6b29a7672209c9f421dbdffea7a3904c 100644 --- a/src/common/xcgroup.c +++ b/src/common/xcgroup.c @@ -93,10 +93,13 @@ int _file_write_content(char* file_path, char* content, size_t csize); * - XCGROUP_ERROR * - XCGROUP_SUCCESS */ -int xcgroup_ns_create(xcgroup_ns_t* cgns, char* mnt_point, char* mnt_args, +int xcgroup_ns_create(slurm_cgroup_conf_t *conf, + xcgroup_ns_t* cgns, char* mnt_point, char* mnt_args, char* subsys, char* notify_prog) { - cgns->mnt_point = xstrdup(mnt_point); + cgns->mnt_point = xstrdup(conf->cgroup_mountpoint); + xstrcat(cgns->mnt_point, mnt_point); + cgns->mnt_args = xstrdup(mnt_args); cgns->subsystems = xstrdup(subsys); cgns->notify_prog = xstrdup(notify_prog); @@ -822,7 +825,8 @@ int _file_write_uint64s(char* file_path, uint64_t* values, int nb) if (rc < 1) { debug2("unable to add value '%s' to file '%s' : %m", tstr, file_path); - fstatus = XCGROUP_ERROR; + if ( errno != ESRCH ) + fstatus = XCGROUP_ERROR; } } @@ -942,7 +946,8 @@ int _file_write_uint32s(char* file_path, uint32_t* values, int nb) if (rc < 1) { debug2("unable to add value '%s' to file '%s' : %m", tstr, file_path); - fstatus = XCGROUP_ERROR; + if ( errno != ESRCH ) + fstatus = XCGROUP_ERROR; } } diff --git a/src/common/xcgroup.h b/src/common/xcgroup.h index cea81f9990980fafae971c94fce0cb02d491ab4f..7b83d278889eb0a80395227bad85c8571feacae1 100644 --- a/src/common/xcgroup.h +++ b/src/common/xcgroup.h @@ -43,14 +43,11 @@ #include <sys/types.h> #include <dirent.h> +#include "xcgroup_read_config.h" #define XCGROUP_ERROR 1 #define XCGROUP_SUCCESS 0 -#ifndef CGROUP_BASEDIR -#define CGROUP_BASEDIR "/cgroup" -#endif - typedef struct xcgroup_ns { char* mnt_point; /* mount point to use for the associated cgroup */ @@ -80,7 +77,8 @@ typedef struct xcgroup { * - XCGROUP_ERROR * - XCGROUP_SUCCESS */ -int xcgroup_ns_create(xcgroup_ns_t* cgns, +int xcgroup_ns_create(slurm_cgroup_conf_t *conf, + xcgroup_ns_t* cgns, char* mnt_point,char* mnt_args, char* subsys,char* notify_prog); diff --git a/src/common/xcgroup_read_config.c b/src/common/xcgroup_read_config.c index 6f71140845259e6eca7a963e47a129d6611e86f3..48fdf7bbbf5b2025db06711708fb8bb24525eb0c 100644 --- a/src/common/xcgroup_read_config.c +++ b/src/common/xcgroup_read_config.c @@ -54,6 +54,8 @@ #include "xcgroup_read_config.h" +#define DEFAULT_CGROUP_BASEDIR "/cgroup" + 
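To make the xcgroup_ns_create() signature change above concrete, here is a rough caller sketch (assumptions: a simplified standalone function rather than the real plugin code, an invented release-agent path, and error handling for the config read omitted). The namespace mount point is now composed from CgroupMountpoint in cgroup.conf plus the per-subsystem suffix instead of the compile-time CGROUP_BASEDIR:

    #include "src/common/xcgroup_read_config.h"
    #include "src/common/xcgroup.h"

    static int create_freezer_ns(xcgroup_ns_t *freezer_ns)
    {
            slurm_cgroup_conf_t conf;

            /* Load cgroup.conf; this fills conf.cgroup_mountpoint
             * (default "/cgroup") along with the other fields. */
            read_slurm_cgroup_conf(&conf);

            /* With CgroupMountpoint=/sys/fs/cgroup this namespace ends up
             * rooted at /sys/fs/cgroup/freezer.  The release-agent path is
             * an assumption made for this sketch. */
            return xcgroup_ns_create(&conf, freezer_ns, "/freezer", "",
                                     "freezer",
                                     "/etc/slurm/cgroup/release_freezer");
    }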
slurm_cgroup_conf_t *slurm_cgroup_conf = NULL; /* Local functions */ @@ -73,14 +75,19 @@ static void _clear_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf) { if (slurm_cgroup_conf) { slurm_cgroup_conf->cgroup_automount = false ; + xfree(slurm_cgroup_conf->cgroup_mountpoint); xfree(slurm_cgroup_conf->cgroup_subsystems); xfree(slurm_cgroup_conf->cgroup_release_agent); + xfree(slurm_cgroup_conf->cgroup_prepend); slurm_cgroup_conf->constrain_cores = false ; slurm_cgroup_conf->task_affinity = false ; slurm_cgroup_conf->constrain_ram_space = false ; slurm_cgroup_conf->allowed_ram_space = 100 ; + slurm_cgroup_conf->max_ram_percent = 100 ; + slurm_cgroup_conf->min_ram_space = XCGROUP_DEFAULT_MIN_RAM; slurm_cgroup_conf->constrain_swap_space = false ; slurm_cgroup_conf->allowed_swap_space = 0 ; + slurm_cgroup_conf->max_swap_percent = 100 ; slurm_cgroup_conf->memlimit_enforcement = 0 ; slurm_cgroup_conf->memlimit_threshold = 100 ; slurm_cgroup_conf->constrain_devices = false ; @@ -88,6 +95,34 @@ static void _clear_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf) } } +/* + * Parse a floating point value in s and return in val + * Return -1 on error and leave *val unchanged. + */ +static int str_to_float (char *s, float *val) +{ + float f; + char *p; + + errno = 0; + f = strtof (s, &p); + + if ((*p != '\0') || (errno != 0)) + return (-1); + + *val = f; + return (0); +} + +static void conf_get_float (s_p_hashtbl_t *t, char *name, float *fp) +{ + char *str; + if (!s_p_get_string(&str, name, t)) + return; + if (str_to_float (str, fp) < 0) + fatal ("cgroup.conf: Invalid value '%s' for %s", str, name); +} + /* * read_slurm_cgroup_conf - load the Slurm cgroup configuration from the * cgroup.conf file. @@ -97,17 +132,21 @@ extern int read_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf) { s_p_options_t options[] = { {"CgroupAutomount", S_P_BOOLEAN}, + {"CgroupMountpoint", S_P_STRING}, {"CgroupSubsystems", S_P_STRING}, {"CgroupReleaseAgentDir", S_P_STRING}, {"ConstrainCores", S_P_BOOLEAN}, {"TaskAffinity", S_P_BOOLEAN}, {"ConstrainRAMSpace", S_P_BOOLEAN}, - {"AllowedRAMSpace", S_P_UINT32}, + {"AllowedRAMSpace", S_P_STRING}, + {"MaxRAMPercent", S_P_STRING}, + {"MinRAMSpace", S_P_UINT32}, {"ConstrainSwapSpace", S_P_BOOLEAN}, - {"AllowedSwapSpace", S_P_UINT32}, + {"AllowedSwapSpace", S_P_STRING}, + {"MaxSwapPercent", S_P_STRING}, {"ConstrainCores", S_P_BOOLEAN}, {"MemoryLimitEnforcement", S_P_BOOLEAN}, - {"MemoryLimitThreshold", S_P_UINT32}, + {"MemoryLimitThreshold", S_P_STRING}, {"ConstrainDevices", S_P_BOOLEAN}, {"AllowedDevicesFile", S_P_STRING}, {NULL} }; @@ -137,8 +176,14 @@ extern int read_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf) /* cgroup initialisation parameters */ if (!s_p_get_boolean(&slurm_cgroup_conf->cgroup_automount, - "CgroupAutomount", tbl)) + "CgroupAutomount", tbl)) slurm_cgroup_conf->cgroup_automount = false; + + if (!s_p_get_string(&slurm_cgroup_conf->cgroup_mountpoint, + "CgroupMountpoint", tbl)) + slurm_cgroup_conf->cgroup_mountpoint = + xstrdup(DEFAULT_CGROUP_BASEDIR); + s_p_get_string(&slurm_cgroup_conf->cgroup_subsystems, "CgroupSubsystems", tbl); s_p_get_string(&slurm_cgroup_conf->cgroup_release_agent, @@ -147,6 +192,13 @@ extern int read_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf) slurm_cgroup_conf->cgroup_release_agent = xstrdup("/etc/slurm/cgroup"); + /* cgroup prepend directory */ +#ifndef MULTIPLE_SLURMD + slurm_cgroup_conf->cgroup_prepend = xstrdup("/slurm"); +#else + slurm_cgroup_conf->cgroup_prepend = 
xstrdup("/slurm_%n"); +#endif + /* Cores constraints related conf items */ if (!s_p_get_boolean(&slurm_cgroup_conf->constrain_cores, "ConstrainCores", tbl)) @@ -159,23 +211,38 @@ extern int read_slurm_cgroup_conf(slurm_cgroup_conf_t *slurm_cgroup_conf) if (!s_p_get_boolean(&slurm_cgroup_conf->constrain_ram_space, "ConstrainRAMSpace", tbl)) slurm_cgroup_conf->constrain_ram_space = false; - if (!s_p_get_uint32(&slurm_cgroup_conf->allowed_ram_space, - "AllowedRAMSpace", tbl)) - slurm_cgroup_conf->allowed_ram_space = 100; + + conf_get_float (tbl, + "AllowedRAMSpace", + &slurm_cgroup_conf->allowed_ram_space); + + conf_get_float (tbl, + "MaxRAMPercent", + &slurm_cgroup_conf->max_ram_percent); + if (!s_p_get_boolean(&slurm_cgroup_conf->constrain_swap_space, "ConstrainSwapSpace", tbl)) slurm_cgroup_conf->constrain_swap_space = false; - if (!s_p_get_uint32(&slurm_cgroup_conf->allowed_swap_space, - "AllowedSwapSpace", tbl)) - slurm_cgroup_conf->allowed_swap_space = 0; + + conf_get_float (tbl, + "AllowedSwapSpace", + &slurm_cgroup_conf->allowed_swap_space); + + conf_get_float (tbl, + "MaxSwapPercent", + &slurm_cgroup_conf->max_swap_percent); + + s_p_get_uint32 (&slurm_cgroup_conf->min_ram_space, + "MinRAMSpace", tbl); /* Memory limits */ if (!s_p_get_boolean(&slurm_cgroup_conf->memlimit_enforcement, "MemoryLimitEnforcement", tbl)) slurm_cgroup_conf->memlimit_enforcement = false; - if (!s_p_get_uint32(&slurm_cgroup_conf->memlimit_threshold, - "MemoryLimitThreshold", tbl)) - slurm_cgroup_conf->memlimit_threshold = 0; + + conf_get_float (tbl, + "MemoryLimitThreshold", + &slurm_cgroup_conf->memlimit_threshold); /* Devices constraint related conf items */ if (!s_p_get_boolean(&slurm_cgroup_conf->constrain_devices, diff --git a/src/common/xcgroup_read_config.h b/src/common/xcgroup_read_config.h index 7693b0e2f3bc3df018215eaae20332a10c7c95b0..a3d0738fa631c13e5cc202b508635ca4c57ba127 100644 --- a/src/common/xcgroup_read_config.h +++ b/src/common/xcgroup_read_config.h @@ -50,25 +50,37 @@ #include <stdint.h> #endif /* HAVE_CONFIG_H */ +/* Default lower bound on memory limit in MB. This is required so we + * don't immediately kill slurmstepd on mem cgroup creation if + * an administrator or user sets and absurdly low mem limit. 
+ */ +#define XCGROUP_DEFAULT_MIN_RAM 30 /* Slurm cgroup plugins configuration parameters */ typedef struct slurm_cgroup_conf { bool cgroup_automount; + char * cgroup_mountpoint; char * cgroup_subsystems; char * cgroup_release_agent; + char * cgroup_prepend; + bool constrain_cores; bool task_affinity; bool constrain_ram_space; - uint32_t allowed_ram_space; + float allowed_ram_space; + float max_ram_percent; /* Upper bound on memory as % of RAM*/ + + uint32_t min_ram_space; /* Lower bound on memory limit (MB) */ bool constrain_swap_space; - uint32_t allowed_swap_space; + float allowed_swap_space; + float max_swap_percent; /* Upper bound on swap as % of RAM */ bool memlimit_enforcement; - uint32_t memlimit_threshold; + float memlimit_threshold; bool constrain_devices; char * allowed_devices_file; diff --git a/src/plugins/proctrack/cgroup/proctrack_cgroup.c b/src/plugins/proctrack/cgroup/proctrack_cgroup.c index b38211be6b28ebb4552839a805c2eaa7c3350233..5de335def98c37d505681e1c1a8d66a15235c999 100644 --- a/src/plugins/proctrack/cgroup/proctrack_cgroup.c +++ b/src/plugins/proctrack/cgroup/proctrack_cgroup.c @@ -54,6 +54,7 @@ #include "src/common/xcgroup_read_config.h" #include "src/common/xcgroup.h" +#include "src/common/xstring.h" #include "src/common/xcpuinfo.h" #include <sys/types.h> @@ -129,7 +130,7 @@ int _slurm_cgroup_init(void) } /* initialize freezer cgroup namespace */ - if (xcgroup_ns_create(&freezer_ns, CGROUP_BASEDIR "/freezer", "", + if (xcgroup_ns_create(&slurm_cgroup_conf, &freezer_ns, "/freezer", "", "freezer", release_agent_path) != XCGROUP_SUCCESS) { error("unable to create freezer cgroup namespace"); @@ -158,15 +159,42 @@ int _slurm_cgroup_init(void) int _slurm_cgroup_create(slurmd_job_t *job, uint64_t id, uid_t uid, gid_t gid) { + /* we do it here as we do not have access to the conf structure */ + /* in libslurm (src/common/xcgroup.c) */ + xcgroup_t slurm_cg; + char* pre = (char*) xstrdup(slurm_cgroup_conf.cgroup_prepend); +#ifdef MULTIPLE_SLURMD + if ( conf->node_name != NULL ) + xstrsubstitute(pre,"%n", conf->node_name); + else { + xfree(pre); + pre = (char*) xstrdup("/slurm"); + } +#endif + + /* create slurm cgroup in the freezer ns (it could already exist) */ + if (xcgroup_create(&freezer_ns, &slurm_cg,pre, + getuid(), getgid()) != XCGROUP_SUCCESS) { + return SLURM_ERROR; + } + if (xcgroup_instanciate(&slurm_cg) != XCGROUP_SUCCESS) { + xcgroup_destroy(&slurm_cg); + return SLURM_ERROR; + } + else + xcgroup_destroy(&slurm_cg); + /* build user cgroup relative path if not set (should not be) */ if (*user_cgroup_path == '\0') { if (snprintf(user_cgroup_path, PATH_MAX, - "/uid_%u", uid) >= PATH_MAX) { + "%s/uid_%u", pre, uid) >= PATH_MAX) { error("unable to build uid %u cgroup relative " "path : %m", uid); + xfree(pre); return SLURM_ERROR; } } + xfree(pre); /* build job cgroup relative path if no set (should not be) */ if (*job_cgroup_path == '\0') { @@ -240,13 +268,19 @@ int _slurm_cgroup_create(slurmd_job_t *job, uint64_t id, uid_t uid, gid_t gid) return SLURM_ERROR; } + /* inhibit release agent for the step cgroup thus letting + * slurmstepd being able to add new pids to the container + * when the job ends (TaskEpilog,...) 
*/ + xcgroup_set_param(&step_freezer_cg,"notify_on_release","0"); + return SLURM_SUCCESS; } int _slurm_cgroup_destroy(void) { if (jobstep_cgroup_path[0] != '\0') { - xcgroup_delete(&step_freezer_cg); + if ( xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS ) + return SLURM_ERROR; xcgroup_destroy(&step_freezer_cg); } @@ -500,8 +534,7 @@ extern int slurm_container_plugin_signal (uint64_t id, int signal) extern int slurm_container_plugin_destroy (uint64_t id) { - _slurm_cgroup_destroy(); - return SLURM_SUCCESS; + return _slurm_cgroup_destroy(); } extern uint64_t slurm_container_plugin_find(pid_t pid) @@ -529,6 +562,7 @@ extern int slurm_container_plugin_wait(uint64_t cont_id) } /* Spin until the container is successfully destroyed */ + /* This indicates that all tasks have exited the container */ while (slurm_container_plugin_destroy(cont_id) != SLURM_SUCCESS) { slurm_container_plugin_signal(cont_id, SIGKILL); sleep(delay); diff --git a/src/plugins/task/cgroup/Makefile.am b/src/plugins/task/cgroup/Makefile.am index f7cc3e07272bfc8970c028363b1ff9964ab5a903..1813b9a4f3e4b2548d7eee33ec6cce3e087bfd30 100644 --- a/src/plugins/task/cgroup/Makefile.am +++ b/src/plugins/task/cgroup/Makefile.am @@ -9,7 +9,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common pkglib_LTLIBRARIES = task_cgroup.la # cgroup task plugin. -task_cgroup_la_SOURCES = task_cgroup.c \ +task_cgroup_la_SOURCES = task_cgroup.h task_cgroup.c \ task_cgroup_cpuset.h task_cgroup_cpuset.c \ task_cgroup_memory.h task_cgroup_memory.c \ task_cgroup_devices.h task_cgroup_devices.c diff --git a/src/plugins/task/cgroup/task_cgroup.c b/src/plugins/task/cgroup/task_cgroup.c index 0fbb3b90731a689a0510e772123b4202c32bd778..810927c09d27fbe2cf9a235ca6ab4223448461f0 100644 --- a/src/plugins/task/cgroup/task_cgroup.c +++ b/src/plugins/task/cgroup/task_cgroup.c @@ -45,10 +45,12 @@ #include "slurm/slurm_errno.h" #include "src/common/slurm_xlator.h" #include "src/slurmd/slurmstepd/slurmstepd_job.h" - -#include "src/common/xcgroup_read_config.h" +#include "src/slurmd/slurmd/slurmd.h" #include "src/common/xcgroup.h" +#include "src/common/xstring.h" +#include "src/common/xcgroup_read_config.h" +#include "task_cgroup.h" #include "task_cgroup_cpuset.h" #include "task_cgroup_memory.h" #include "task_cgroup_devices.h" @@ -278,3 +280,41 @@ extern int task_post_step (slurmd_job_t *job) fini(); return SLURM_SUCCESS; } + +extern char* task_cgroup_create_slurm_cg (xcgroup_ns_t* ns) { + + /* we do it here as we do not have access to the conf structure */ + /* in libslurm (src/common/xcgroup.c) */ + xcgroup_t slurm_cg; + char* pre = (char*) xstrdup(slurm_cgroup_conf.cgroup_prepend); +#ifdef MULTIPLE_SLURMD + if ( conf->node_name != NULL ) + xstrsubstitute(pre,"%n", conf->node_name); + else { + xfree(pre); + pre = (char*) xstrdup("/slurm"); + } +#endif + + /* create slurm cgroup in the ns (it could already exist) */ + if (xcgroup_create(ns,&slurm_cg,pre, + getuid(), getgid()) != XCGROUP_SUCCESS) { + xfree(pre); + return pre; + } + if (xcgroup_instanciate(&slurm_cg) != XCGROUP_SUCCESS) { + error("unable to build slurm cgroup for ns %s: %m", + ns->subsystems); + xcgroup_destroy(&slurm_cg); + xfree(pre); + return pre; + } + else { + debug3("slurm cgroup %s successfully created for ns %s: %m", + pre,ns->subsystems); + xcgroup_destroy(&slurm_cg); + } + +exit: + return pre; +} diff --git a/src/plugins/task/cgroup/task_cgroup.h b/src/plugins/task/cgroup/task_cgroup.h new file mode 100644 index 
0000000000000000000000000000000000000000..a65d3a4f2edba0b55a54cb6f3afda0e5587be1e3 --- /dev/null +++ b/src/plugins/task/cgroup/task_cgroup.h @@ -0,0 +1,46 @@ +/*****************************************************************************\ + * task_cgroup.h - cgroup common primitives for task/cgroup + ***************************************************************************** + * Copyright (C) 2009 CEA/DAM/DIF + * Written by Matthieu Hautreux <matthieu.hautreux@cea.fr> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.schedmd.com/slurmdocs/>. + * Please also read the included file: DISCLAIMER. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#ifndef _TASK_CGROUP_H_ +#define _TASK_CGROUP_H_ + +extern char* task_cgroup_create_slurm_cg (xcgroup_ns_t* ns); + +#endif diff --git a/src/plugins/task/cgroup/task_cgroup_cpuset.c b/src/plugins/task/cgroup/task_cgroup_cpuset.c index 59f46512d39feb5668af5d31cb80f8e62f6fa471..78df78aeaf75d37752aadc286e01fa1f8ec15e3c 100644 --- a/src/plugins/task/cgroup/task_cgroup_cpuset.c +++ b/src/plugins/task/cgroup/task_cgroup_cpuset.c @@ -52,6 +52,8 @@ #include "src/common/xcgroup.h" #include "src/common/xcpuinfo.h" +#include "task_cgroup.h" + #ifdef HAVE_HWLOC #include <hwloc.h> #include <hwloc/glibc-sched.h> @@ -94,7 +96,7 @@ extern int task_cgroup_cpuset_init(slurm_cgroup_conf_t *slurm_cgroup_conf) error("task/cgroup: unable to build cpuset release agent path"); goto error; } - if (xcgroup_ns_create(&cpuset_ns,CGROUP_BASEDIR "/cpuset","", + if (xcgroup_ns_create(slurm_cgroup_conf, &cpuset_ns, "/cpuset", "", "cpuset",release_agent_path) != XCGROUP_SUCCESS) { error("task/cgroup: unable to create cpuset namespace"); @@ -165,15 +167,45 @@ extern int task_cgroup_cpuset_create(slurmd_job_t *job) char* cpus = NULL; size_t cpus_size; + char* slurm_cgpath ; + xcgroup_t slurm_cg; + + /* create slurm root cg in this cg namespace */ + slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns); + if ( slurm_cgpath == NULL ) { + return SLURM_ERROR; + } + + /* check that this cgroup has cpus allowed or initialize them */ + if (xcgroup_load(&cpuset_ns,&slurm_cg,slurm_cgpath) + != XCGROUP_SUCCESS) { + error("task/cgroup: unable to load slurm cpuset xcgroup"); + xfree(slurm_cgpath); + return SLURM_ERROR; + } + rc = xcgroup_get_param(&slurm_cg,"cpuset.cpus",&cpus,&cpus_size); + if (rc != XCGROUP_SUCCESS || cpus_size == 1) { + /* initialize the cpusets as it was inexistant */ + if (_xcgroup_cpuset_init(&slurm_cg) != + XCGROUP_SUCCESS) { + xfree(slurm_cgpath); + xcgroup_destroy(&slurm_cg); + return SLURM_ERROR; + } + } + xfree(cpus); + /* build user cgroup relative path if not set (should not be) */ if (*user_cgroup_path == '\0') { - if (snprintf(user_cgroup_path,PATH_MAX, - "/uid_%u",uid) >= PATH_MAX) { - error("task/cgroup: unable to build uid %u cpuset " - "cg relative path : %m",uid); + if (snprintf(user_cgroup_path, PATH_MAX, + "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) { + error("unable to build uid %u cgroup relative " + "path : %m", uid); + xfree(slurm_cgpath); return SLURM_ERROR; } } + xfree(slurm_cgpath); /* build job cgroup relative path if no set (should not be) */ if (*job_cgroup_path == '\0') { diff --git a/src/plugins/task/cgroup/task_cgroup_devices.c b/src/plugins/task/cgroup/task_cgroup_devices.c index 9053bec0e91bb5eff7cbcc0e63250145cd6b5e06..6a39f87eacf138cb486ff84e5ec7ecf2e592bbaa 100644 --- a/src/plugins/task/cgroup/task_cgroup_devices.c +++ b/src/plugins/task/cgroup/task_cgroup_devices.c @@ -56,11 +56,12 @@ #include "src/common/gres.h" #include "src/common/list.h" +#include "task_cgroup.h" + #ifndef PATH_MAX #define PATH_MAX 256 #endif - static char user_cgroup_path[PATH_MAX]; static char job_cgroup_path[PATH_MAX]; static char jobstep_cgroup_path[PATH_MAX]; @@ -102,7 +103,7 @@ extern int task_cgroup_devices_init(slurm_cgroup_conf_t *slurm_cgroup_conf) error("task/cgroup: unable to build devices release agent path"); goto error; } - if ( xcgroup_ns_create(&devices_ns,CGROUP_BASEDIR "/devices","", + if (xcgroup_ns_create(slurm_cgroup_conf, &devices_ns, "/devices","", 
"devices",release_agent_path) != XCGROUP_SUCCESS ) { error("task/cgroup: unable to create devices namespace"); @@ -179,16 +180,25 @@ extern int task_cgroup_devices_create(slurmd_job_t *job) List job_gres_list = job->job_gres_list; List step_gres_list = job->step_gres_list; - + char* slurm_cgpath ; + + /* create slurm root cg in this cg namespace */ + slurm_cgpath = task_cgroup_create_slurm_cg(&devices_ns); + if ( slurm_cgpath == NULL ) { + return SLURM_ERROR; + } + /* build user cgroup relative path if not set (should not be) */ - if ( *user_cgroup_path == '\0' ) { - if ( snprintf(user_cgroup_path,PATH_MAX, - "/uid_%u", uid) >= PATH_MAX ) { - error("task/cgroup: unable to build uid %u devices " - "cg relative path : %m", uid); + if (*user_cgroup_path == '\0') { + if (snprintf(user_cgroup_path, PATH_MAX, + "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) { + error("unable to build uid %u cgroup relative " + "path : %m", uid); + xfree(slurm_cgpath); return SLURM_ERROR; } } + xfree(slurm_cgpath); /* build job cgroup relative path if no set (should not be) */ if ( *job_cgroup_path == '\0' ) { diff --git a/src/plugins/task/cgroup/task_cgroup_memory.c b/src/plugins/task/cgroup/task_cgroup_memory.c index 70de4c76f2d651e6c0160ec2b34b36320d243e4f..a7e0b0de711ef9cf869de112ab8ece60f0ac77fc 100644 --- a/src/plugins/task/cgroup/task_cgroup_memory.c +++ b/src/plugins/task/cgroup/task_cgroup_memory.c @@ -50,10 +50,14 @@ #include "src/common/xcgroup_read_config.h" #include "src/common/xcgroup.h" +#include "task_cgroup.h" + #ifndef PATH_MAX #define PATH_MAX 256 #endif +extern slurmd_conf_t *conf; + static char user_cgroup_path[PATH_MAX]; static char job_cgroup_path[PATH_MAX]; static char jobstep_cgroup_path[PATH_MAX]; @@ -64,9 +68,18 @@ static xcgroup_t user_memory_cg; static xcgroup_t job_memory_cg; static xcgroup_t step_memory_cg; -static int allowed_ram_space; -static int allowed_swap_space; +static float allowed_ram_space; /* Allowed RAM in percent */ +static float allowed_swap_space; /* Allowed Swap percent */ + +static uint64_t max_ram; /* Upper bound for memory.limit_in_bytes */ +static uint64_t max_swap; /* Upper bound for swap */ +static uint64_t totalram; /* Total real memory available on node */ +static uint64_t min_ram_space; /* Don't constrain RAM below this value */ +static uint64_t percent_in_bytes (uint64_t mb, float percent) +{ + return ((mb * 1024 * 1024) * (percent / 100.0)); +} extern int task_cgroup_memory_init(slurm_cgroup_conf_t *slurm_cgroup_conf) { @@ -84,7 +97,7 @@ extern int task_cgroup_memory_init(slurm_cgroup_conf_t *slurm_cgroup_conf) error("task/cgroup: unable to build memory release agent path"); goto error; } - if (xcgroup_ns_create(&memory_ns,CGROUP_BASEDIR "/memory","", + if (xcgroup_ns_create(slurm_cgroup_conf, &memory_ns, "/memory", "", "memory",release_agent_path) != XCGROUP_SUCCESS) { error("task/cgroup: unable to create memory namespace"); @@ -110,6 +123,25 @@ extern int task_cgroup_memory_init(slurm_cgroup_conf_t *slurm_cgroup_conf) allowed_ram_space = slurm_cgroup_conf->allowed_ram_space; allowed_swap_space = slurm_cgroup_conf->allowed_swap_space; + if ((totalram = (uint64_t) conf->real_memory_size) == 0) + error ("task/cgroup: Unable to get RealMemory size"); + + max_ram = percent_in_bytes(totalram, slurm_cgroup_conf->max_ram_percent); + max_swap = percent_in_bytes(totalram, slurm_cgroup_conf->max_swap_percent); + max_swap += max_ram; + min_ram_space = slurm_cgroup_conf->min_ram_space * 1024 * 1024; + + debug ("task/cgroup/memory: total:%luM allowed:%.4g%%, 
swap:%.4g%%, " + "max:%.4g%%(%luM) max+swap:%.4g%%(%luM) min:%uM", + (unsigned long) totalram, + allowed_ram_space, + allowed_swap_space, + slurm_cgroup_conf->max_ram_percent, + (unsigned long) (max_ram/(1024*1024)), + slurm_cgroup_conf->max_swap_percent, + (unsigned long) (max_swap/(1024*1024)), + (unsigned) slurm_cgroup_conf->min_ram_space); + /* * Warning: OOM Killer must be disabled for slurmstepd * or it would be destroyed if the application use @@ -171,6 +203,76 @@ extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf) return SLURM_SUCCESS; } +/* + * Return configured memory limit in bytes given a memory limit in MB. + */ +static uint64_t mem_limit_in_bytes (uint64_t mem) +{ + /* + * If mem == 0 then assume there was no SLURM limit imposed + * on the amount of memory for job or step. Use the total + * amount of available RAM instead. + */ + if (mem == 0) + mem = totalram * 1024 * 1024; + else + mem = percent_in_bytes (mem, allowed_ram_space); + if (mem < min_ram_space) + return (min_ram_space); + if (mem > max_ram) + return (max_ram); + return (mem); +} + +/* + * Return configured swap limit in bytes given a memory limit in MB. + * + * Swap limit is calculated as: + * + * mem_limit_in_bytes + (configured_swap_percent * allocated_mem_in_bytes) + */ +static uint64_t swap_limit_in_bytes (uint64_t mem) +{ + uint64_t swap; + /* + * If mem == 0 assume "unlimited" and use totalram. + */ + swap = percent_in_bytes (mem ? mem : totalram, allowed_swap_space); + mem = mem_limit_in_bytes (mem) + swap; + if (mem < min_ram_space) + return (min_ram_space); + if (mem > max_swap) + return (max_swap); + return (mem); +} + +static int memcg_initialize (xcgroup_ns_t *ns, xcgroup_t *cg, + char *path, uint64_t mem_limit, uid_t uid, gid_t gid) +{ + uint64_t mlb = mem_limit_in_bytes (mem_limit); + uint64_t mls = swap_limit_in_bytes (mem_limit); + + if (xcgroup_create (ns, cg, path, uid, gid) != XCGROUP_SUCCESS) + return -1; + + if (xcgroup_instanciate (cg) != XCGROUP_SUCCESS) { + xcgroup_destroy (cg); + return -1; + } + + xcgroup_set_param (cg, "memory.use_hierarchy","1"); + xcgroup_set_uint64_param (cg, "memory.limit_in_bytes", mlb); + xcgroup_set_uint64_param (cg, "memory.memsw.limit_in_bytes", mls); + + info ("task/cgroup: %s: alloc=%luMB mem.limit=%luMB memsw.limit=%luMB", + path, + (unsigned long) mem_limit, + (unsigned long) mlb/(1024*1024), + (unsigned long) mls/(1024*1024)); + + return 0; +} + extern int task_cgroup_memory_create(slurmd_job_t *job) { int rc; @@ -181,19 +283,28 @@ extern int task_cgroup_memory_create(slurmd_job_t *job) uint32_t jobid = job->jobid; uint32_t stepid = job->stepid; uid_t uid = job->uid; - uid_t gid = job->gid; + gid_t gid = job->gid; pid_t pid; - uint64_t ml,mlb,mls; + + char* slurm_cgpath ; + + /* create slurm root cg in this cg namespace */ + slurm_cgpath = task_cgroup_create_slurm_cg(&memory_ns); + if ( slurm_cgpath == NULL ) { + return SLURM_ERROR; + } /* build user cgroup relative path if not set (should not be) */ if (*user_cgroup_path == '\0') { - if (snprintf(user_cgroup_path,PATH_MAX, - "/uid_%u",uid) >= PATH_MAX) { - error("task/cgroup: unable to build uid %u memory " - "cg relative path : %m",uid); + if (snprintf(user_cgroup_path, PATH_MAX, + "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) { + error("unable to build uid %u cgroup relative " + "path : %m", uid); + xfree(slurm_cgpath); return SLURM_ERROR; } } + xfree(slurm_cgpath); /* build job cgroup relative path if no set (should not be) */ if (*job_cgroup_path == '\0') { @@ -265,58 +376,22 @@ 
extern int task_cgroup_memory_create(slurmd_job_t *job) * container in order to guarantee that a job will stay on track * regardless of the consumption of each step. */ - ml = (uint64_t) job->job_mem; - ml = ml * 1024 * 1024 ; - mlb = (uint64_t) (ml * (allowed_ram_space / 100.0)) ; - mls = (uint64_t) mlb + (ml * (allowed_swap_space / 100.0)) ; - if (xcgroup_create(&memory_ns,&job_memory_cg, - job_cgroup_path, - getuid(),getgid()) != XCGROUP_SUCCESS) { - xcgroup_destroy(&user_memory_cg); - goto error; - } - if (xcgroup_instanciate(&job_memory_cg) != XCGROUP_SUCCESS) { - xcgroup_destroy(&user_memory_cg); - xcgroup_destroy(&job_memory_cg); + if (memcg_initialize (&memory_ns, &job_memory_cg, job_cgroup_path, + job->job_mem, getuid(), getgid()) < 0) { + xcgroup_destroy (&user_memory_cg); goto error; } - xcgroup_set_param(&job_memory_cg,"memory.use_hierarchy","1"); - xcgroup_set_uint64_param(&job_memory_cg, - "memory.limit_in_bytes",mlb); - xcgroup_set_uint64_param(&job_memory_cg, - "memory.memsw.limit_in_bytes",mls); - debug("task/cgroup: job mem.limit=%"PRIu64"MB memsw.limit=%"PRIu64"MB", - mlb/(1024*1024),mls/(1024*1024)); /* * Create step cgroup in the memory ns (it should not exists) * and set the associated memory limits. */ - ml = (uint64_t) job->step_mem; - ml = ml * 1024 * 1024 ; - mlb = (uint64_t) (ml * (allowed_ram_space / 100.0)) ; - mls = (uint64_t) mlb + (ml * (allowed_swap_space / 100.0)) ; - if (xcgroup_create(&memory_ns,&step_memory_cg, - jobstep_cgroup_path, - uid,gid) != XCGROUP_SUCCESS) { - /* do not delete user/job cgroup as */ - /* they can exist for other steps */ - xcgroup_destroy(&user_memory_cg); - xcgroup_destroy(&job_memory_cg); - goto error; - } - if (xcgroup_instanciate(&step_memory_cg) != XCGROUP_SUCCESS) { + if (memcg_initialize (&memory_ns, &step_memory_cg, jobstep_cgroup_path, + job->step_mem, uid, gid) < 0) { xcgroup_destroy(&user_memory_cg); xcgroup_destroy(&job_memory_cg); - xcgroup_destroy(&step_memory_cg); goto error; } - xcgroup_set_uint64_param(&step_memory_cg, - "memory.limit_in_bytes",mlb); - xcgroup_set_uint64_param(&step_memory_cg, - "memory.memsw.limit_in_bytes",mls); - debug("task/cgroup: step mem.limit=%"PRIu64"MB memsw.limit=%"PRIu64"MB", - mlb/(1024*1024),mls/(1024*1024)); /* * Attach the slurmstepd to the step memory cgroup diff --git a/src/slurmd/common/slurmstepd_init.c b/src/slurmd/common/slurmstepd_init.c index 4aa8debc82459cf51f0ae620e61ad20624205df1..e33b7b28ca43439fe893eb4ca007b7c7bb2a5383 100644 --- a/src/slurmd/common/slurmstepd_init.c +++ b/src/slurmd/common/slurmstepd_init.c @@ -45,6 +45,7 @@ extern void pack_slurmd_conf_lite(slurmd_conf_t *conf, Buf buffer) pack16(conf->sockets, buffer); pack16(conf->cores, buffer); pack16(conf->threads, buffer); + pack32(conf->real_memory_size, buffer); packstr(conf->spooldir, buffer); packstr(conf->node_name, buffer); packstr(conf->logfile, buffer); @@ -70,6 +71,7 @@ extern int unpack_slurmd_conf_lite_no_alloc(slurmd_conf_t *conf, Buf buffer) safe_unpack16(&conf->sockets, buffer); safe_unpack16(&conf->cores, buffer); safe_unpack16(&conf->threads, buffer); + safe_unpack32(&conf->real_memory_size, buffer); safe_unpackstr_xmalloc(&conf->spooldir, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&conf->node_name, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&conf->logfile, &uint32_tmp, buffer); diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 9bb8271bfbb430248915d17454083a0b9d636089..55e9d549872036f9d5c13ae1449e007662da96a9 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ 
b/src/slurmd/slurmstepd/mgr.c @@ -75,6 +75,13 @@ # include <stdlib.h> #endif +#ifdef HAVE_PTY_H +# include <pty.h> +# ifdef HAVE_UTMP_H +# include <utmp.h> +# endif +#endif + #include "slurm/slurm_errno.h" #include "src/common/cbuf.h" @@ -1088,6 +1095,110 @@ _spank_task_privileged(slurmd_job_t *job, int taskid, struct priv_state *sp) return(_drop_privileges (job, true, sp)); } +struct exec_wait_info { + int id; + pid_t pid; + int parentfd; + int childfd; +}; + +static struct exec_wait_info * exec_wait_info_create (int i) +{ + int fdpair[2]; + struct exec_wait_info * e; + + if (pipe (fdpair) < 0) { + error ("exec_wait_info_create: pipe: %m"); + return NULL; + } + + fd_set_close_on_exec(fdpair[0]); + fd_set_close_on_exec(fdpair[1]); + + e = xmalloc (sizeof (*e)); + e->childfd = fdpair[0]; + e->parentfd = fdpair[1]; + e->id = i; + e->pid = -1; + + return (e); +} + +static void exec_wait_info_destroy (struct exec_wait_info *e) +{ + if (e == NULL) + return; + + close (e->parentfd); + close (e->childfd); + e->id = -1; + e->pid = -1; +} + +static pid_t exec_wait_get_pid (struct exec_wait_info *e) +{ + if (e == NULL) + return (-1); + return (e->pid); +} + +static struct exec_wait_info * fork_child_with_wait_info (int id) +{ + struct exec_wait_info *e; + + if (!(e = exec_wait_info_create (id))) + return (NULL); + + if ((e->pid = fork ()) < 0) { + exec_wait_info_destroy (e); + return (NULL); + } + else if (e->pid == 0) /* In child, close parent fd */ + close (e->parentfd); + + return (e); +} + +static int exec_wait_child_wait_for_parent (struct exec_wait_info *e) +{ + char c; + + if (read (e->childfd, &c, sizeof (c)) != 1) + return error ("wait_for_parent: failed: %m"); + + return (0); +} + +static int exec_wait_signal_child (struct exec_wait_info *e) +{ + char c = '\0'; + + if (write (e->parentfd, &c, sizeof (c)) != 1) + return error ("write to unblock task %d failed: %m", e->id); + + return (0); +} + +static int exec_wait_signal (struct exec_wait_info *e, slurmd_job_t *job) +{ + debug3 ("Unblocking %u.%u task %d, writefd = %d", + job->jobid, job->stepid, e->id, e->parentfd); + exec_wait_signal_child (e); + return (0); +} + +static void prepare_tty (slurmd_job_t *job, slurmd_task_info_t *task) +{ +#ifdef HAVE_PTY_H + if (job->pty && (task->gtid == 0)) { + if (login_tty(task->stdin_fd)) + error("login_tty: %m"); + else + debug3("login_tty good"); + } +#endif + return; +} /* fork and exec N tasks */ @@ -1096,12 +1207,10 @@ _fork_all_tasks(slurmd_job_t *job) { int rc = SLURM_SUCCESS; int i; - int *writefds; /* array of write file descriptors */ - int *readfds; /* array of read file descriptors */ - int fdpair[2]; struct priv_state sprivs; jobacct_id_t jobacct_id; char *oom_value; + List exec_wait_list = NULL; xassert(job != NULL); @@ -1118,36 +1227,6 @@ _fork_all_tasks(slurmd_job_t *job) } debug2("After call to spank_init()"); - /* - * Pre-allocate a pipe for each of the tasks - */ - debug3("num tasks on this node = %d", job->node_tasks); - writefds = (int *) xmalloc (job->node_tasks * sizeof(int)); - if (!writefds) { - error("writefds xmalloc failed!"); - return SLURM_ERROR; - } - readfds = (int *) xmalloc (job->node_tasks * sizeof(int)); - if (!readfds) { - error("readfds xmalloc failed!"); - return SLURM_ERROR; - } - - - for (i = 0; i < job->node_tasks; i++) { - fdpair[0] = -1; fdpair[1] = -1; - if (pipe (fdpair) < 0) { - error ("exec_all_tasks: pipe: %m"); - return SLURM_ERROR; - } - debug3("New fdpair[0] = %d, fdpair[1] = %d", - fdpair[0], fdpair[1]); - fd_set_close_on_exec(fdpair[0]); - 
fd_set_close_on_exec(fdpair[1]); - readfds[i] = fdpair[0]; - writefds[i] = fdpair[1]; - } - set_oom_adj(0); /* the tasks may be killed by OOM */ if (pre_setuid(job)) { error("Failed task affinity setup"); @@ -1185,27 +1264,33 @@ _fork_all_tasks(slurmd_job_t *job) return SLURM_ERROR; } + exec_wait_list = list_create ((ListDelF) exec_wait_info_destroy); + if (!exec_wait_list) + return error ("Unable to create exec_wait_list"); + /* * Fork all of the task processes. */ for (i = 0; i < job->node_tasks; i++) { char time_stamp[256]; pid_t pid; - if ((pid = fork ()) < 0) { + struct exec_wait_info *ei; + + if ((ei = fork_child_with_wait_info (i)) == NULL) { error("child fork: %m"); goto fail2; - } else if (pid == 0) { /* child */ - int j; + } else if ((pid = exec_wait_get_pid (ei)) == 0) { /* child */ + /* + * Destroy exec_wait_list in the child. + * Only exec_wait_info for previous tasks have been + * added to the list so far, so everything else + * can be discarded. + */ + list_destroy (exec_wait_list); #ifdef HAVE_AIX (void) mkcrid(0); #endif - /* Close file descriptors not needed by the child */ - for (j = 0; j < job->node_tasks; j++) { - close(writefds[j]); - if (j > i) - close(readfds[j]); - } /* jobacct_gather_g_endpoll(); * closing jobacct files here causes deadlock */ @@ -1229,14 +1314,28 @@ _fork_all_tasks(slurmd_job_t *job) xsignal_unblock(slurmstepd_blocked_signals); - exec_task(job, i, readfds[i]); + /* + * Setup tty before any setpgid() calls + */ + prepare_tty (job, job->task[i]); + + /* + * Block until parent notifies us that it is ok to + * proceed. This allows the parent to place all + * children in any process groups or containers + * before they make a call to exec(2). + */ + exec_wait_child_wait_for_parent (ei); + + exec_task(job, i); } /* * Parent continues: */ - close(readfds[i]); + list_append (exec_wait_list, ei); + LOG_TIMESTAMP(time_stamp); verbose ("task %lu (%lu) started %s", (unsigned long) job->task[i]->gtid, @@ -1306,16 +1405,10 @@ _fork_all_tasks(slurmd_job_t *job) /* * Now it's ok to unblock the tasks, so they may call exec. 
*/ - for (i = 0; i < job->node_tasks; i++) { - char c = '\0'; - - debug3("Unblocking %u.%u task %d, writefd = %d", - job->jobid, job->stepid, i, writefds[i]); - if (write (writefds[i], &c, sizeof (c)) != 1) - error ("write to unblock task %d failed", i); - - close(writefds[i]); + list_for_each (exec_wait_list, (ListForF) exec_wait_signal, job); + list_destroy (exec_wait_list); + for (i = 0; i < job->node_tasks; i++) { /* * Prepare process for attach by parallel debugger * (if specified and able) @@ -1324,17 +1417,14 @@ _fork_all_tasks(slurmd_job_t *job) == SLURM_ERROR) rc = SLURM_ERROR; } - xfree(writefds); - xfree(readfds); return rc; fail2: _reclaim_privileges (&sprivs); + if (exec_wait_list) + list_destroy (exec_wait_list); fail1: - xfree(writefds); - xfree(readfds); - pam_finish(); return SLURM_ERROR; } @@ -2124,6 +2214,7 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, { int status, rc, opt; pid_t cpid; + struct exec_wait_info *ei; xassert(env); if (path == NULL || path[0] == '\0') @@ -2140,11 +2231,11 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, (slurm_container_create(job) != SLURM_SUCCESS)) error("slurm_container_create: %m"); - if ((cpid = fork()) < 0) { + if ((ei = fork_child_with_wait_info(0)) == NULL) { error ("executing %s: fork: %m", name); return -1; } - if (cpid == 0) { + if ((cpid = exec_wait_get_pid (ei)) == 0) { struct priv_state sprivs; char *argv[2]; @@ -2171,6 +2262,11 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, #else setpgrp(); #endif + /* + * Wait for signal from parent + */ + exec_wait_child_wait_for_parent (ei); + execve(path, argv, env); error("execve(): %m"); exit(127); @@ -2178,6 +2274,11 @@ _run_script_as_user(const char *name, const char *path, slurmd_job_t *job, if (slurm_container_add(job, cpid) != SLURM_SUCCESS) error("slurm_container_add: %m"); + + if (exec_wait_signal_child (ei) < 0) + error ("run_script_as_user: Failed to wakeup %s", name); + exec_wait_info_destroy (ei); + if (max_wait < 0) opt = 0; else diff --git a/src/slurmd/slurmstepd/task.c b/src/slurmd/slurmstepd/task.c index 4462596b35f0c3d46b613639d2b48d35ddd48c92..925e67e4ab0bc037bff252f05d5d268f8df3aaba 100644 --- a/src/slurmd/slurmstepd/task.c +++ b/src/slurmd/slurmstepd/task.c @@ -65,14 +65,6 @@ # include <sys/checkpnt.h> #endif -#ifdef HAVE_PTY_H -# include <pty.h> -#endif - -#ifdef HAVE_UTMP_H -# include <utmp.h> -#endif - #include <sys/resource.h> #include "slurm/slurm_errno.h" @@ -337,37 +329,15 @@ _setup_mpi(slurmd_job_t *job, int ltaskid) * Current process is running as the user when this is called. 
*/ void -exec_task(slurmd_job_t *job, int i, int waitfd) +exec_task(slurmd_job_t *job, int i) { - char c; uint32_t *gtids; /* pointer to arrary of ranks */ int fd, j; - int rc; slurmd_task_info_t *task = job->task[i]; -#ifdef HAVE_PTY_H - /* Execute login_tty() before setpgid() calls */ - if (job->pty && (task->gtid == 0)) { - if (login_tty(task->stdin_fd)) - error("login_tty: %m"); - else - debug3("login_tty good"); - } -#endif - if (i == 0) _make_tmpdir(job); - /* - * Stall exec until all tasks have joined the same process group - */ - if ((rc = read (waitfd, &c, sizeof (c))) != 1) { - error ("_exec_task read failed, fd = %d, rc=%d: %m", waitfd, rc); - log_fini(); - exit(1); - } - close(waitfd); - gtids = xmalloc(job->node_tasks * sizeof(uint32_t)); for (j = 0; j < job->node_tasks; j++) gtids[j] = job->task[j]->gtid; @@ -422,14 +392,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd) } } -#ifdef HAVE_PTY_H - if (job->pty && (task->gtid == 0)) { - /* Need to perform the login_tty() before all tasks - * register and the process groups are reset, otherwise - * login_tty() gets disabled */ - } else -#endif - io_dup_stdio(task); + io_dup_stdio(task); /* task-specific pre-launch activities */ diff --git a/src/slurmd/slurmstepd/task.h b/src/slurmd/slurmstepd/task.h index d067df52dca3f090966a1073dafae42748fc2421..78c0b6058cdc402411bff87e710adba65f9538ea 100644 --- a/src/slurmd/slurmstepd/task.h +++ b/src/slurmd/slurmstepd/task.h @@ -52,6 +52,6 @@ #include "src/slurmd/slurmstepd/slurmstepd_job.h" -void exec_task(slurmd_job_t *job, int i, int waitfd); +void exec_task(slurmd_job_t *job, int i); #endif /* !_TASK_H */
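The fork/exec changes above replace the per-task read/write fd arrays with an exec_wait handshake: each child blocks on a pipe until the parent has placed it in its process group and job container, and only then calls exec(). Below is a minimal standalone sketch of that handshake pattern (plain POSIX, none of the SLURM types, and without the close-on-exec flags and List bookkeeping the real code keeps):

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    int main(void)
    {
            int fds[2];
            pid_t pid;
            char c = '\0';

            if (pipe(fds) < 0 || (pid = fork()) < 0)
                    return 1;

            if (pid == 0) {                 /* child */
                    close(fds[1]);
                    /* Block until the parent releases us, as in
                     * exec_wait_child_wait_for_parent(). */
                    if (read(fds[0], &c, sizeof(c)) != 1)
                            _exit(127);
                    execlp("echo", "echo", "released", (char *)NULL);
                    _exit(127);
            }

            /* Parent: place the child in its process group / job container
             * here, then unblock it, as in exec_wait_signal_child(). */
            close(fds[0]);
            if (write(fds[1], &c, sizeof(c)) != 1)
                    perror("write");
            close(fds[1]);

            return (waitpid(pid, NULL, 0) < 0);
    }

The same pattern is reused by _run_script_as_user() in the patch, so that prolog/epilog scripts only exec() after slurm_container_add() has added them to the job container.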