Skip to content
Snippets Groups Projects
Commit e12ba6fe authored by Moe Jette
Browse files

Set "/proc/self/oom_adj" for slurmd and slurmstepd daemons based upon
  the values of SLURMD_OOM_ADJ and SLURMSTEPD_OOM_ADJ environment
  variables. This can be used to prevent daemons being killed when
  a node's memory is exhausted.
parent c1b23c5f
No related branches found
No related tags found
No related merge requests found
...@@ -19,6 +19,10 @@ documents those changes that are of interest to users and admins. ...@@ -19,6 +19,10 @@ documents those changes that are of interest to users and admins.
set TopologyType=topology/tree and add configuration information set TopologyType=topology/tree and add configuration information
to a new file called topology.conf. See "man topology.conf" or to a new file called topology.conf. See "man topology.conf" or
topology.html web page for details. topology.html web page for details.
-- Set "/proc/self/oom_adj" for slurmd and slurmstepd daemons based upon
the values of SLURMD_OOM_ADJ and SLURMSTEPD_OOM_ADJ environment
variables. This can be used to prevent daemons being killed when
a node's memory is exhausted.
* Changes in SLURM 1.4.0-pre9 * Changes in SLURM 1.4.0-pre9
============================= =============================
......
...@@ -139,3 +139,7 @@ OTHER CHANGES ...@@ -139,3 +139,7 @@ OTHER CHANGES
* Modify PMI_Get_clique_ranks() to return an array of integers rather * Modify PMI_Get_clique_ranks() to return an array of integers rather
than a char * to satisfy PMI standard. Correct logic in than a char * to satisfy PMI standard. Correct logic in
PMI_Get_clique_size() for when srun --overcommit option is used. PMI_Get_clique_size() for when srun --overcommit option is used.
* Set "/proc/self/oom_adj" for slurmd and slurmstepd daemons based upon
the values of SLURMD_OOM_ADJ and SLURMSTEPD_OOM_ADJ environment
variables. This can be used to prevent daemons being killed when
a node's memory is exhausted.
...@@ -46,7 +46,6 @@ ...@@ -46,7 +46,6 @@
extern int set_oom_adj(int adj) extern int set_oom_adj(int adj)
{ {
#if 0
int fd; int fd;
char oom_adj[16]; char oom_adj[16];
...@@ -55,7 +54,7 @@ extern int set_oom_adj(int adj) ...@@ -55,7 +54,7 @@ extern int set_oom_adj(int adj)
if (errno == ENOENT) if (errno == ENOENT)
debug("failed to open /proc/self/oom_adj: %m"); debug("failed to open /proc/self/oom_adj: %m");
else else
verbose("failed to open /proc/self/oom_adj: %m"); error("failed to open /proc/self/oom_adj: %m");
return -1; return -1;
} }
if (snprintf(oom_adj, 16, "%d", adj) >= 16) { if (snprintf(oom_adj, 16, "%d", adj) >= 16) {
...@@ -64,7 +63,7 @@ extern int set_oom_adj(int adj) ...@@ -64,7 +63,7 @@ extern int set_oom_adj(int adj)
while ((write(fd, oom_adj, strlen(oom_adj)) < 0) && (errno == EINTR)) while ((write(fd, oom_adj, strlen(oom_adj)) < 0) && (errno == EINTR))
; ;
close(fd); close(fd);
#endif
return 0; return 0;
} }
......
...@@ -153,6 +153,7 @@ main (int argc, char *argv[]) ...@@ -153,6 +153,7 @@ main (int argc, char *argv[])
{ {
int i, pidfd; int i, pidfd;
int blocked_signals[] = {SIGPIPE, 0}; int blocked_signals[] = {SIGPIPE, 0};
char *oom_value;
/* /*
* Make sure we have no extra open files which * Make sure we have no extra open files which
...@@ -201,7 +202,11 @@ main (int argc, char *argv[]) ...@@ -201,7 +202,11 @@ main (int argc, char *argv[])
info("slurmd version %s started", SLURM_VERSION); info("slurmd version %s started", SLURM_VERSION);
debug3("finished daemonize"); debug3("finished daemonize");
set_oom_adj(OOM_DISABLE); if ((oom_value = getenv("SLURMD_OOM_ADJ"))) {
i = atoi(oom_value);
debug("Setting slurmd oom_adj to %d", i);
set_oom_adj(i);
}
_kill_old_slurmd(); _kill_old_slurmd();
......
...@@ -877,6 +877,7 @@ _fork_all_tasks(slurmd_job_t *job) ...@@ -877,6 +877,7 @@ _fork_all_tasks(slurmd_job_t *job)
int fdpair[2]; int fdpair[2];
struct priv_state sprivs; struct priv_state sprivs;
jobacct_id_t jobacct_id; jobacct_id_t jobacct_id;
char *oom_value;
xassert(job != NULL); xassert(job != NULL);
...@@ -929,6 +930,9 @@ _fork_all_tasks(slurmd_job_t *job) ...@@ -929,6 +930,9 @@ _fork_all_tasks(slurmd_job_t *job)
writefds[i] = fdpair[1]; writefds[i] = fdpair[1];
} }
error("setting user oom to zero");
set_oom_adj(0); /* the tasks may be killed by OOM */
/* Temporarily drop effective privileges, except for the euid. /* Temporarily drop effective privileges, except for the euid.
* We need to wait until after pam_setup() to drop euid. * We need to wait until after pam_setup() to drop euid.
*/ */
...@@ -971,8 +975,6 @@ _fork_all_tasks(slurmd_job_t *job) ...@@ -971,8 +975,6 @@ _fork_all_tasks(slurmd_job_t *job)
} else if (pid == 0) { /* child */ } else if (pid == 0) { /* child */
int j; int j;
set_oom_adj(0); /* the tasks may be killed by OOM */
#ifdef HAVE_AIX #ifdef HAVE_AIX
(void) mkcrid(0); (void) mkcrid(0);
#endif #endif
...@@ -1029,6 +1031,14 @@ _fork_all_tasks(slurmd_job_t *job) ...@@ -1029,6 +1031,14 @@ _fork_all_tasks(slurmd_job_t *job)
/* Don't bother erroring out here */ /* Don't bother erroring out here */
} }
if ((oom_value = getenv("SLURMSTEPD_OOM_ADJ"))) {
int i = atoi(oom_value);
debug("Setting slurmstepd oom_adj to %d", i);
set_oom_adj(i);
} else
error("NO SLURMSTEPD_OOM_ADJ");
if (chdir (sprivs.saved_cwd) < 0) { if (chdir (sprivs.saved_cwd) < 0) {
error ("Unable to return to working directory"); error ("Unable to return to working directory");
} }
......
...@@ -58,7 +58,6 @@ ...@@ -58,7 +58,6 @@
#include "src/slurmd/common/slurmstepd_init.h" #include "src/slurmd/common/slurmstepd_init.h"
#include "src/slurmd/common/setproctitle.h" #include "src/slurmd/common/setproctitle.h"
#include "src/slurmd/common/proctrack.h" #include "src/slurmd/common/proctrack.h"
#include "src/slurmd/common/set_oomadj.h"
#include "src/slurmd/slurmstepd/slurmstepd.h" #include "src/slurmd/slurmstepd/slurmstepd.h"
#include "src/slurmd/slurmstepd/mgr.h" #include "src/slurmd/slurmstepd/mgr.h"
#include "src/slurmd/slurmstepd/req.h" #include "src/slurmd/slurmstepd/req.h"
...@@ -102,8 +101,6 @@ main (int argc, char *argv[]) ...@@ -102,8 +101,6 @@ main (int argc, char *argv[])
exit(0); exit(0);
} }
set_oom_adj(OOM_DISABLE);
xsignal_block(slurmstepd_blocked_signals); xsignal_block(slurmstepd_blocked_signals);
conf = xmalloc(sizeof(*conf)); conf = xmalloc(sizeof(*conf));
conf->argv = &argv; conf->argv = &argv;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment