Skip to content
Snippets Groups Projects
Commit 51862f56 authored by Danny Auble
Browse files

Add news about oom check for task/cgroup and minor formatting

parent 84a55f34
No related branches found
No related tags found
No related merge requests found
......@@ -5,6 +5,7 @@ documents those changes that are of interest to users and admins.
========================
-- Correction to hostlist parsing bug introduced in v2.6.4 for hostlists with
more than one numeric range in brackets (e.g. "rack[0-3]_blade[0-63]").
-- Add notification if using proctrack/cgroup and task/cgroup when oom hits.
* Changes in Slurm 2.6.4
========================
......
......@@ -451,11 +451,13 @@ extern int task_cgroup_memory_attach_task(slurmd_job_t *job)
return fstatus;
}
extern int task_cgroup_memory_check_oom(slurmd_job_t *job) {
extern int task_cgroup_memory_check_oom(slurmd_job_t *job)
{
xcgroup_t memory_cg;
uint64_t memory_memsw_failcnt;
if (xcgroup_create(&memory_ns,&memory_cg,"",0,0) == XCGROUP_SUCCESS) {
if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
== XCGROUP_SUCCESS) {
if (xcgroup_lock(&memory_cg) == XCGROUP_SUCCESS) {
/* for some reason the job cgroup limit is hit
* for a step and vice versa...
......@@ -464,14 +466,14 @@ extern int task_cgroup_memory_check_oom(slurmd_job_t *job) {
xcgroup_get_uint64_param(&step_memory_cg,
"memory.memsw.failcnt",
&memory_memsw_failcnt);
if(memory_memsw_failcnt > 0)
if (memory_memsw_failcnt > 0)
error("Exceeded step memory limit at some "
"point. oom-killer likely "
"killed a process.");
xcgroup_get_uint64_param(&job_memory_cg,
"memory.memsw.failcnt",
&memory_memsw_failcnt);
if(memory_memsw_failcnt > 0)
if (memory_memsw_failcnt > 0)
error("Exceeded job memory limit at some "
"point. oom-killer likely "
"killed a process.");
......
......@@ -55,6 +55,7 @@ extern int task_cgroup_memory_create(slurmd_job_t *job);
/* create a task cgroup and attach the task to it */
extern int task_cgroup_memory_attach_task(slurmd_job_t *job);
/* detect if oom ran on a step or job and print notice of said event:
 * reads the memory.memsw.failcnt counter of the step and of the job memory
 * cgroup and logs an error for each counter that is greater than zero */
extern int task_cgroup_memory_check_oom(slurmd_job_t *job);
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment