From a6397bb2650f945196c68b5f44b023c42eac04ff Mon Sep 17 00:00:00 2001
From: Ryan Cox <ryan_cox@byu.edu>
Date: Thu, 7 Nov 2013 16:39:56 -0700
Subject: [PATCH] task/cgroup: notify if memory.memsw.failcnt > 0. indicates oom

---
 src/plugins/task/cgroup/task_cgroup.c        |  3 +
 src/plugins/task/cgroup/task_cgroup_memory.c | 49 ++++++++++++++++++++
 src/plugins/task/cgroup/task_cgroup_memory.h |  2 +
 3 files changed, 54 insertions(+)

diff --git a/src/plugins/task/cgroup/task_cgroup.c b/src/plugins/task/cgroup/task_cgroup.c
index 239b16aad4f..c9147ba68c1 100644
--- a/src/plugins/task/cgroup/task_cgroup.c
+++ b/src/plugins/task/cgroup/task_cgroup.c
@@ -289,6 +289,9 @@ extern int task_pre_launch (slurmd_job_t *job)
  */
 extern int task_post_term (slurmd_job_t *job)
 {
+	if (use_memory) {
+		task_cgroup_memory_check_oom(job);
+	}
 	return SLURM_SUCCESS;
 }
 
diff --git a/src/plugins/task/cgroup/task_cgroup_memory.c b/src/plugins/task/cgroup/task_cgroup_memory.c
index 4558ec37b26..626874ae0c3 100644
--- a/src/plugins/task/cgroup/task_cgroup_memory.c
+++ b/src/plugins/task/cgroup/task_cgroup_memory.c
@@ -450,3 +450,52 @@ extern int task_cgroup_memory_attach_task(slurmd_job_t *job)
 
 	return fstatus;
 }
+
+/*
+ * Return non-zero if the counter `param` of cgroup `cg` was read
+ * successfully and is greater than zero.  A failed read is reported as
+ * zero so that the caller never takes a decision on an uninitialized
+ * value.
+ */
+static int failcnt_non_zero(xcgroup_t *cg, char *param)
+{
+	uint64_t value = 0;
+
+	if (xcgroup_get_uint64_param(cg, param, &value) != XCGROUP_SUCCESS)
+		return 0;
+	return value > 0;
+}
+
+/*
+ * task_cgroup_memory_check_oom() logs an error when memory.memsw.failcnt
+ * of the step or of the job memory cgroup is greater than zero.  The root
+ * memory cgroup is locked while both counters are read.  `job` is
+ * currently unused.  Always returns SLURM_SUCCESS: the check is purely
+ * informational, failures to create or lock the root memcg are only logged.
+ */
+extern int task_cgroup_memory_check_oom(slurmd_job_t *job)
+{
+	xcgroup_t memory_cg;
+
+	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
+	    != XCGROUP_SUCCESS) {
+		error("task/cgroup task_cgroup_memory_check_oom: unable to create root memcg : %m");
+		return SLURM_SUCCESS;
+	}
+
+	if (xcgroup_lock(&memory_cg) != XCGROUP_SUCCESS) {
+		error("task/cgroup task_cgroup_memory_check_oom: unable to lock root memcg : %m");
+		xcgroup_destroy(&memory_cg);
+		return SLURM_SUCCESS;
+	}
+
+	/* for some reason the job cgroup limit is hit for a step and vice
+	 * versa... can't tell which is which so we'll treat them the same */
+	if (failcnt_non_zero(&step_memory_cg, "memory.memsw.failcnt"))
+		error("Exceeded job memory limit at some point. oom-killer likely killed a process.");
+	if (failcnt_non_zero(&job_memory_cg, "memory.memsw.failcnt"))
+		error("Exceeded job memory limit at some point. oom-killer likely killed a process.");
+
+	xcgroup_unlock(&memory_cg);
+	xcgroup_destroy(&memory_cg);
+	return SLURM_SUCCESS;
+}
diff --git a/src/plugins/task/cgroup/task_cgroup_memory.h b/src/plugins/task/cgroup/task_cgroup_memory.h
index bda89804865..a631e626a49 100644
--- a/src/plugins/task/cgroup/task_cgroup_memory.h
+++ b/src/plugins/task/cgroup/task_cgroup_memory.h
@@ -55,4 +55,6 @@ extern int task_cgroup_memory_create(slurmd_job_t *job);
 /* create a task cgroup and attach the task to it */
 extern int task_cgroup_memory_attach_task(slurmd_job_t *job);
 
+extern int task_cgroup_memory_check_oom(slurmd_job_t *job);
+
 #endif
-- 
GitLab