From fcffea8ca3ff9a2e97f20900eef190f004582a6e Mon Sep 17 00:00:00 2001
From: Matthieu Hautreux <matthieu.hautreux@cea.fr>
Date: Mon, 22 Oct 2012 13:14:15 -0700
Subject: [PATCH] Task/cgroup memory logic now try to manually remove the cg
 dirs hierarchy

In order to be more deterministic in the removal of the different layers
of memcg involved when a step is running, do it manually by first acquiring
a lock on the root memcg to protect concurrent creations of step memcg from
invalid intermediate memcg directories in their own hierarchies.
---
 src/plugins/task/cgroup/task_cgroup_memory.c | 63 ++++++++++++++------
 1 file changed, 44 insertions(+), 19 deletions(-)

diff --git a/src/plugins/task/cgroup/task_cgroup_memory.c b/src/plugins/task/cgroup/task_cgroup_memory.c
index 06c7979de11..ad8ae53abe5 100644
--- a/src/plugins/task/cgroup/task_cgroup_memory.c
+++ b/src/plugins/task/cgroup/task_cgroup_memory.c
@@ -161,16 +161,43 @@ extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
 	     job_cgroup_path[0] == '\0' ||
 	     jobstep_cgroup_path[0] == '\0')
 		return SLURM_SUCCESS;
-
 	/*
-	 * Delete the step memory cgroup as all the tasks have now exited
-	 * The job memory cgroup will be removed by the release agent
-	 * if possible (no other step running).
-	 * The user memory cgroup will be removed by the release agent
-	 * when possible too (no other job running).
+	 * Lock the root memcg and try to remove the different memcgs.
+	 * The reason why we are locking here is that if a concurrent
+	 * step is in the process of being executed, he could try to
+	 * create the step memcg just after we remove the job memcg,
+	 * resulting in a failure.
+	 * First, delete step memcg as all the tasks have now exited.
+	 * Then, try to remove the job memcg.
+	 * If it fails, it is due to the fact that it is still in use by an
+	 * other running step.
+	 * After that, try to remove the user memcg. If it fails, it is due
+	 * to jobs that are still running for the same user on the node or 
+	 * because of tasks attached directly to the user cg by an other
+	 * component (PAM). The user memcg was created with the
+	 * notify_on_release=1 flag (default) so it will be removed 
+	 * automatically after that.
+	 * For now, do not try to detect if only externally attached tasks
+	 * are present to see if they can be be moved to an orhpan memcg. 
+	 * That could be done in the future, if it is necessary.
 	 */
-	if (xcgroup_delete(&step_memory_cg) != SLURM_SUCCESS)
-		error("task/cgroup: unable to remove step memcg : %m");
+	if (xcgroup_create(&memory_ns,&memory_cg,"",0,0) == XCGROUP_SUCCESS) {
+		if (xcgroup_lock(&memory_cg) == XCGROUP_SUCCESS) {
+			if (xcgroup_delete(&step_memory_cg) != SLURM_SUCCESS)
+				error("task/cgroup: unable to remove step "
+				      "memcg : %m");
+			if (xcgroup_delete(&job_memory_cg) != XCGROUP_SUCCESS)
+				info("task/cgroup: not removing "
+				     "job memcg : %m");
+			if (xcgroup_delete(&user_memory_cg) != XCGROUP_SUCCESS)
+				info("task/cgroup: not removing "
+				     "user memcg : %m");
+			xcgroup_unlock(&memory_cg);
+		} else
+			error("task/cgroup: unable to lock root memcg : %m");
+		xcgroup_destroy(&memory_cg);
+	} else
+		error("task/cgroup: unable to create root memcg : %m");
 
 	xcgroup_destroy(&user_memory_cg);
 	xcgroup_destroy(&job_memory_cg);
@@ -229,7 +256,8 @@ static uint64_t swap_limit_in_bytes (uint64_t mem)
 }
 
 static int memcg_initialize (xcgroup_ns_t *ns, xcgroup_t *cg,
-		char *path, uint64_t mem_limit, uid_t uid, gid_t gid)
+			     char *path, uint64_t mem_limit, uid_t uid,
+			     gid_t gid, uint32_t notify)
 {
 	uint64_t mlb = mem_limit_in_bytes (mem_limit);
 	uint64_t mls = swap_limit_in_bytes  (mem_limit);
@@ -237,6 +265,8 @@ static int memcg_initialize (xcgroup_ns_t *ns, xcgroup_t *cg,
 	if (xcgroup_create (ns, cg, path, uid, gid) != XCGROUP_SUCCESS)
 		return -1;
 
+	cg->notify = notify; 
+
 	if (xcgroup_instanciate (cg) != XCGROUP_SUCCESS) {
 		xcgroup_destroy (cg);
 		return -1;
@@ -375,9 +405,11 @@ extern int task_cgroup_memory_create(slurmd_job_t *job)
 	/*
 	 * Create job cgroup in the memory ns (it could already exist)
 	 * and set the associated memory limits.
+	 * Disable notify_on_release for this memcg, it will be
+	 * manually removed by the plugin at the end of the step.
 	 */
 	if (memcg_initialize (&memory_ns, &job_memory_cg, job_cgroup_path,
-	                      job->job_mem, getuid(), getgid()) < 0) {
+	                      job->job_mem, getuid(), getgid(), 0) < 0) {
 		xcgroup_destroy (&user_memory_cg);
 		goto error;
 	}
@@ -385,22 +417,15 @@ extern int task_cgroup_memory_create(slurmd_job_t *job)
 	/*
 	 * Create step cgroup in the memory ns (it should not exists)
 	 * and set the associated memory limits.
-	 * Then disable notify_on_release for the step memcg, it will be
+	 * Disable notify_on_release for the step memcg, it will be
 	 * manually removed by the plugin at the end of the step.
 	 */
 	if (memcg_initialize (&memory_ns, &step_memory_cg, jobstep_cgroup_path,
-	                      job->step_mem, uid, gid) < 0) {
+	                      job->step_mem, uid, gid, 0) < 0) {
 		xcgroup_destroy(&user_memory_cg);
 		xcgroup_destroy(&job_memory_cg);
 		goto error;
 	}
-	if (xcgroup_set_params(&step_memory_cg, "notify_on_release=0")
-	    != XCGROUP_SUCCESS) {
-		/* treat that error as a warning as the release agent would
-		 * purge the memcg in that case */
-		error("task/cgroup: unable to disable notify_on_release of "
-		      "step memcg '%s'",step_memory_cg.path);
-	}
 
 error:
 	xcgroup_unlock(&memory_cg);
-- 
GitLab