From a6397bb2650f945196c68b5f44b023c42eac04ff Mon Sep 17 00:00:00 2001
From: Ryan Cox <ryan_cox@byu.edu>
Date: Thu, 7 Nov 2013 16:39:56 -0700
Subject: [PATCH] task/cgroup: notify if memory.memsw.failcnt > 0. indicates
 oom

---
 src/plugins/task/cgroup/task_cgroup.c        |  3 +++
 src/plugins/task/cgroup/task_cgroup_memory.c | 24 ++++++++++++++++++++
 src/plugins/task/cgroup/task_cgroup_memory.h |  2 ++
 3 files changed, 29 insertions(+)

diff --git a/src/plugins/task/cgroup/task_cgroup.c b/src/plugins/task/cgroup/task_cgroup.c
index 239b16aad4f..c9147ba68c1 100644
--- a/src/plugins/task/cgroup/task_cgroup.c
+++ b/src/plugins/task/cgroup/task_cgroup.c
@@ -289,6 +289,9 @@ extern int task_pre_launch (slurmd_job_t *job)
  */
 extern int task_post_term (slurmd_job_t *job)
 {
+	/* with use_memory set, run the memory cgroup OOM check for this job */
+	if (use_memory)
+		task_cgroup_memory_check_oom(job);
 	return SLURM_SUCCESS;
 }
 
diff --git a/src/plugins/task/cgroup/task_cgroup_memory.c b/src/plugins/task/cgroup/task_cgroup_memory.c
index 4558ec37b26..626874ae0c3 100644
--- a/src/plugins/task/cgroup/task_cgroup_memory.c
+++ b/src/plugins/task/cgroup/task_cgroup_memory.c
@@ -450,3 +450,27 @@ extern int task_cgroup_memory_attach_task(slurmd_job_t *job)
 
 	return fstatus;
 }
+
+/* Log an error if the step or job memcg reports memory.memsw.failcnt > 0. Always returns SLURM_SUCCESS. */
+extern int task_cgroup_memory_check_oom(slurmd_job_t *job) {
+	xcgroup_t memory_cg;
+	uint64_t failcnt = 0;
+
+	if (xcgroup_create(&memory_ns,&memory_cg,"",0,0) != XCGROUP_SUCCESS) {
+		error("task/cgroup task_cgroup_memory_check_oom: unable to create root memcg : %m");
+		return SLURM_SUCCESS;
+	}
+	if (xcgroup_lock(&memory_cg) == XCGROUP_SUCCESS) {
+		/* for some reason the job cgroup limit is hit for a step and vice versa...
+		 * can't tell which is which so we'll treat them the same.
+		 * A failed read is skipped so a stale or indeterminate count is never tested. */
+		if (xcgroup_get_uint64_param(&step_memory_cg, "memory.memsw.failcnt", &failcnt) == XCGROUP_SUCCESS && failcnt > 0)
+			error("Exceeded job memory limit at some point. oom-killer likely killed a process.");
+		if (xcgroup_get_uint64_param(&job_memory_cg, "memory.memsw.failcnt", &failcnt) == XCGROUP_SUCCESS && failcnt > 0)
+			error("Exceeded job memory limit at some point. oom-killer likely killed a process.");
+		xcgroup_unlock(&memory_cg);
+	} else
+		error("task/cgroup task_cgroup_memory_check_oom: unable to lock root memcg : %m");
+	xcgroup_destroy(&memory_cg);
+	return SLURM_SUCCESS;
+}
diff --git a/src/plugins/task/cgroup/task_cgroup_memory.h b/src/plugins/task/cgroup/task_cgroup_memory.h
index bda89804865..a631e626a49 100644
--- a/src/plugins/task/cgroup/task_cgroup_memory.h
+++ b/src/plugins/task/cgroup/task_cgroup_memory.h
@@ -55,4 +55,6 @@ extern int task_cgroup_memory_create(slurmd_job_t *job);
 /* create a task cgroup and attach the task to it */
 extern int task_cgroup_memory_attach_task(slurmd_job_t *job);
 
+extern int task_cgroup_memory_check_oom(slurmd_job_t *job);
+
 #endif
-- 
GitLab