diff --git a/src/plugins/task/cgroup/task_cgroup.c b/src/plugins/task/cgroup/task_cgroup.c
index 239b16aad4faec892d8d8da527ad212638b24875..c9147ba68c157bd2959dea89f69defc180377f21 100644
--- a/src/plugins/task/cgroup/task_cgroup.c
+++ b/src/plugins/task/cgroup/task_cgroup.c
@@ -289,6 +289,9 @@ extern int task_pre_launch (slurmd_job_t *job)
  */
 extern int task_post_term (slurmd_job_t *job)
 {
+	if (use_memory) {
+		task_cgroup_memory_check_oom(job);
+	}
 	return SLURM_SUCCESS;
 }
 
diff --git a/src/plugins/task/cgroup/task_cgroup_memory.c b/src/plugins/task/cgroup/task_cgroup_memory.c
index 4558ec37b269b17b952f37b3a2d71c2e49dde27a..626874ae0c3d12e12accdcac0fbb5f0602305cdc 100644
--- a/src/plugins/task/cgroup/task_cgroup_memory.c
+++ b/src/plugins/task/cgroup/task_cgroup_memory.c
@@ -450,3 +450,63 @@ extern int task_cgroup_memory_attach_task(slurmd_job_t *job)
 
 	return fstatus;
 }
+
+/*
+ * task_cgroup_memory_check_oom() - report memory+swap limit hits.
+ *
+ * Takes the lock on the cgroup created at the "" path of memory_ns (the
+ * "root memcg" of the error messages), reads "memory.memsw.failcnt" from
+ * step_memory_cg and job_memory_cg, and logs an error for each counter
+ * that is greater than zero.
+ *
+ * IN job - not used by this function
+ * RET SLURM_SUCCESS in every case; a failure to create or lock the root
+ *     memcg is logged and otherwise ignored.
+ */
+extern int task_cgroup_memory_check_oom(slurmd_job_t *job)
+{
+	xcgroup_t memory_cg;
+	uint64_t memory_memsw_failcnt;
+
+	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0) !=
+	    XCGROUP_SUCCESS) {
+		error("task/cgroup task_cgroup_memory_check_oom: "
+		      "unable to create root memcg : %m");
+		return SLURM_SUCCESS;
+	}
+
+	if (xcgroup_lock(&memory_cg) != XCGROUP_SUCCESS) {
+		error("task/cgroup task_cgroup_memory_check_oom: "
+		      "unable to lock root memcg : %m");
+		xcgroup_destroy(&memory_cg);
+		return SLURM_SUCCESS;
+	}
+
+	/* for some reason the job cgroup limit is hit for a step and vice
+	 * versa... can't tell which is which so we'll treat them the same */
+
+	/* Reset to 0 and require a successful read before each test, so a
+	 * failed read can neither expose an uninitialized value nor reuse
+	 * the count read from the other cgroup.
+	 * NOTE(review): assumes xcgroup_get_uint64_param() returns
+	 * XCGROUP_SUCCESS on success like the other xcgroup calls here. */
+	memory_memsw_failcnt = 0;
+	if ((xcgroup_get_uint64_param(&step_memory_cg, "memory.memsw.failcnt",
+				      &memory_memsw_failcnt)
+	     == XCGROUP_SUCCESS) && (memory_memsw_failcnt > 0)) {
+		error("Exceeded job memory limit at some point. "
+		      "oom-killer likely killed a process.");
+	}
+
+	memory_memsw_failcnt = 0;
+	if ((xcgroup_get_uint64_param(&job_memory_cg, "memory.memsw.failcnt",
+				      &memory_memsw_failcnt)
+	     == XCGROUP_SUCCESS) && (memory_memsw_failcnt > 0)) {
+		error("Exceeded job memory limit at some point. "
+		      "oom-killer likely killed a process.");
+	}
+
+	xcgroup_unlock(&memory_cg);
+	xcgroup_destroy(&memory_cg);
+	return SLURM_SUCCESS;
+}
diff --git a/src/plugins/task/cgroup/task_cgroup_memory.h b/src/plugins/task/cgroup/task_cgroup_memory.h
index bda89804865c72aeeea0371800b2e09f7dbb041d..a631e626a49e10a9dfd0e81131d29f7a457a4b66 100644
--- a/src/plugins/task/cgroup/task_cgroup_memory.h
+++ b/src/plugins/task/cgroup/task_cgroup_memory.h
@@ -55,4 +55,7 @@ extern int task_cgroup_memory_create(slurmd_job_t *job);
 /* create a task cgroup and attach the task to it */
 extern int task_cgroup_memory_attach_task(slurmd_job_t *job);
 
+/* log an error if the step or job memory+swap failcnt is greater than 0 */
+extern int task_cgroup_memory_check_oom(slurmd_job_t *job);
+
 #endif