Skip to content
Snippets Groups Projects
Commit 7b7cfa93 authored by Danny Auble
Browse files

Merge pull request #60 from ryanbcox/slurm-2.6

task/cgroup: notify if memory.memsw.failcnt > 0. indicates oom

Thanks Ryan, this looks correct.
parents 80e83fc0 a6397bb2
No related branches found
No related tags found
No related merge requests found
...@@ -289,6 +289,9 @@ extern int task_pre_launch (slurmd_job_t *job) ...@@ -289,6 +289,9 @@ extern int task_pre_launch (slurmd_job_t *job)
*/ */
extern int task_post_term (slurmd_job_t *job) extern int task_post_term (slurmd_job_t *job)
{ {
/* When use_memory is set, look for signs that a memory limit was hit.
 * The check only logs; its return value is deliberately not propagated. */
if (use_memory) {
task_cgroup_memory_check_oom(job);
}
/* This hook reports success unconditionally. */
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
......
...@@ -450,3 +450,27 @@ extern int task_cgroup_memory_attach_task(slurmd_job_t *job) ...@@ -450,3 +450,27 @@ extern int task_cgroup_memory_attach_task(slurmd_job_t *job)
return fstatus; return fstatus;
} }
/*
 * task_cgroup_memory_check_oom - log an error if the step or job memory
 * cgroup reports a non-zero memory.memsw.failcnt.
 *
 * The root memory cgroup is created and locked around the reads, then
 * unlocked and destroyed. A counter that cannot be read is treated as zero
 * so that a failed read never produces a spurious out-of-memory report.
 *
 * IN job - not referenced by this function
 * RET SLURM_SUCCESS in all cases; problems are only logged via error()
 */
extern int task_cgroup_memory_check_oom(slurmd_job_t *job) {
	xcgroup_t memory_cg;
	uint64_t memory_memsw_failcnt;

	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0) != XCGROUP_SUCCESS) {
		error("task/cgroup task_cgroup_memory_check_oom: unable to create root memcg : %m");
		return SLURM_SUCCESS;
	}

	if (xcgroup_lock(&memory_cg) == XCGROUP_SUCCESS) {
		/* for some reason the job cgroup limit is hit for a step and
		 * vice versa... can't tell which is which so we'll treat them
		 * the same */

		/* Reset before every read: the getter's result is only
		 * trusted when it reports success. */
		memory_memsw_failcnt = 0;
		if (xcgroup_get_uint64_param(&step_memory_cg, "memory.memsw.failcnt",
					     &memory_memsw_failcnt) == XCGROUP_SUCCESS
		    && memory_memsw_failcnt > 0) {
			error("Exceeded job memory limit at some point. oom-killer likely killed a process.");
		}

		memory_memsw_failcnt = 0;
		if (xcgroup_get_uint64_param(&job_memory_cg, "memory.memsw.failcnt",
					     &memory_memsw_failcnt) == XCGROUP_SUCCESS
		    && memory_memsw_failcnt > 0) {
			error("Exceeded job memory limit at some point. oom-killer likely killed a process.");
		}

		xcgroup_unlock(&memory_cg);
	} else {
		error("task/cgroup task_cgroup_memory_check_oom: task_cgroup_memory_check_oom: unable to lock root memcg : %m");
	}

	xcgroup_destroy(&memory_cg);
	return SLURM_SUCCESS;
}
...@@ -55,4 +55,6 @@ extern int task_cgroup_memory_create(slurmd_job_t *job); ...@@ -55,4 +55,6 @@ extern int task_cgroup_memory_create(slurmd_job_t *job);
/* create a task cgroup and attach the task to it */ /* create a task cgroup and attach the task to it */
extern int task_cgroup_memory_attach_task(slurmd_job_t *job); extern int task_cgroup_memory_attach_task(slurmd_job_t *job);
/* log an error if the step or job memory cgroup has a non-zero
 * memory.memsw.failcnt */
extern int task_cgroup_memory_check_oom(slurmd_job_t *job);
#endif #endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment