Skip to content
Snippets Groups Projects
Commit 57cc39dc authored by Danny Auble's avatar Danny Auble
Browse files

Merge remote-tracking branch 'origin/slurm-2.6'

parents fd9f4fb9 51862f56
No related branches found
No related tags found
No related merge requests found
......@@ -136,6 +136,7 @@ documents those changes that are of interest to users and admins.
========================
-- Correction to hostlist parsing bug introduced in v2.6.4 for hostlists with
more than one numeric range in brackets (e.g. "rack[0-3]_blade[0-63]").
-- Add notification if using proctrack/cgroup and task/cgroup when oom hits.
* Changes in Slurm 2.6.4
========================
......
......@@ -289,6 +289,9 @@ extern int task_p_pre_launch (stepd_step_rec_t *job)
*/
extern int task_p_post_term (stepd_step_rec_t *job, stepd_step_task_info_t *task)
{
if (use_memory) {
task_cgroup_memory_check_oom(job);
}
return SLURM_SUCCESS;
}
......
......@@ -450,3 +450,41 @@ extern int task_cgroup_memory_attach_task(stepd_step_rec_t *job)
return fstatus;
}
/*
 * task_cgroup_memory_check_oom - report whether the step or job memory
 * cgroup recorded a memory+swap limit failure.
 *
 * Reads "memory.memsw.failcnt" from step_memory_cg and job_memory_cg while
 * holding the lock on the root memory cgroup, and logs an error for each
 * counter that is greater than zero.
 *
 * IN job - not used by this function
 * RET SLURM_SUCCESS always; problems are only logged
 *
 * NOTE(review): task_p_post_term() passes a stepd_step_rec_t * here; confirm
 * that slurmd_job_t is still declared and compatible on this branch.
 */
extern int task_cgroup_memory_check_oom(slurmd_job_t *job)
{
	xcgroup_t memory_cg;
	uint64_t memory_memsw_failcnt;

	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup task_cgroup_memory_check_oom: "
		      "unable to create root memcg : %m");
		return SLURM_SUCCESS;
	}

	if (xcgroup_lock(&memory_cg) != XCGROUP_SUCCESS) {
		error("task/cgroup task_cgroup_memory_check_oom: "
		      "unable to lock root memcg : %m");
		xcgroup_destroy(&memory_cg);
		return SLURM_SUCCESS;
	}

	/* for some reason the job cgroup limit is hit
	 * for a step and vice versa...
	 * can't tell which is which so we'll treat
	 * them the same */

	/* Zero the counter before each read so that a failed read is never
	 * compared while uninitialized and never reuses the previous
	 * cgroup's value (assumes the getter leaves the output untouched on
	 * failure - TODO confirm). */
	memory_memsw_failcnt = 0;
	xcgroup_get_uint64_param(&step_memory_cg,
				 "memory.memsw.failcnt",
				 &memory_memsw_failcnt);
	if (memory_memsw_failcnt > 0)
		error("Exceeded step memory limit at some "
		      "point. oom-killer likely "
		      "killed a process.");

	memory_memsw_failcnt = 0;
	xcgroup_get_uint64_param(&job_memory_cg,
				 "memory.memsw.failcnt",
				 &memory_memsw_failcnt);
	if (memory_memsw_failcnt > 0)
		error("Exceeded job memory limit at some "
		      "point. oom-killer likely "
		      "killed a process.");

	xcgroup_unlock(&memory_cg);
	xcgroup_destroy(&memory_cg);

	return SLURM_SUCCESS;
}
......@@ -55,4 +55,7 @@ extern int task_cgroup_memory_create(stepd_step_rec_t *job);
/* Create a task cgroup and attach the task to it; returns an int status. */
extern int task_cgroup_memory_attach_task(stepd_step_rec_t *job);
/* Check the "memory.memsw.failcnt" counter of the step and job memory
 * cgroups and log an error for each one that is non-zero (the limit was
 * exceeded at some point, so the oom-killer likely killed a process).
 * Always returns SLURM_SUCCESS. */
extern int task_cgroup_memory_check_oom(slurmd_job_t *job);
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment