From eeec6cd25280b71a284f51cb3a890f410ade26fb Mon Sep 17 00:00:00 2001
From: Carlos Tripiana Montes <tripiana@schedmd.com>
Date: Thu, 15 Apr 2021 17:07:06 +0200
Subject: [PATCH] job_container/tmpfs: add functionality to restore NSs state
 after restart

container_p_restore now gets the list of running jobs from the spool dir
with stepd_available. Then, it iterates over basepath entries and, for
those which seem to have been a mount point (have a .ns file), tries to
mount them again. If that succeeds (it must), and if the job for this
mount point is dead, it releases resources and tries to delete the files.

Remember the removal can fail if a resource is leaked. These would be
fixed if slurmd starts after HW reboot (no kernel leaks).

The socket returned by stepd_connect is closed right after the liveness
probe, and the step list is kept until the directory traversal has
finished because running_job_ids points into its elements.

Bug 11093
---
 .../job_container/tmpfs/job_container_tmpfs.c | 135 ++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/src/plugins/job_container/tmpfs/job_container_tmpfs.c b/src/plugins/job_container/tmpfs/job_container_tmpfs.c
index 16635372064..d67a1d87d78 100644
--- a/src/plugins/job_container/tmpfs/job_container_tmpfs.c
+++ b/src/plugins/job_container/tmpfs/job_container_tmpfs.c
@@ -58,6 +58,9 @@
 
 #include "read_jcconf.h"
 
+static int _create_ns(uint32_t job_id, bool remount);
+static int _delete_ns(uint32_t job_id);
+
 #if defined (__APPLE__)
 extern slurmd_conf_t *conf __attribute__((weak_import));
 #else
@@ -71,6 +74,7 @@ const uint32_t plugin_version = SLURM_VERSION_NUMBER;
 static slurm_jc_conf_t *jc_conf = NULL;
 static int step_ns_fd = -1;
 static bool force_rm = true;
+static List running_job_ids = NULL;
 
 static int _create_paths(uint32_t job_id,
 			 char *job_mount,
@@ -125,6 +129,101 @@ static int _create_paths(uint32_t job_id,
 	return SLURM_SUCCESS;
 }
 
+/*
+ * ListFindF matcher for lists whose elements are uint32_t job ids.
+ * Returns non-zero when the stored id equals the id pointed to by job_id.
+ */
+static int _find_job_id_in_list(uint32_t *list_job_id, uint32_t *job_id)
+{
+	return (*list_job_id == *job_id);
+}
+
+/*
+ * list_for_each() callback: record the job id of a step_loc_t in the list
+ * given by arg, once per job, when a connection to its stepd can be opened.
+ *
+ * NOTE: the list stores a pointer to the job id inside the step_loc_t itself,
+ * so the step list being iterated must outlive the job id list.
+ *
+ * Always returns SLURM_SUCCESS.
+ */
+static int _append_job_in_list(void *element, void *arg)
+{
+	step_loc_t *stepd = (step_loc_t *) element;
+	List job_id_list = (List) arg;
+	int fd;
+
+	xassert(job_id_list);
+
+	/* A job with several steps only needs to be probed once. */
+	if (list_find_first(job_id_list, (ListFindF)_find_job_id_in_list,
+			    &stepd->step_id.job_id))
+		return SLURM_SUCCESS;
+
+	fd = stepd_connect(stepd->directory,
+			   stepd->nodename,
+			   &stepd->step_id,
+			   &stepd->protocol_version);
+	if (fd != -1) {
+		/* Only reachability matters here; do not leak the socket. */
+		close(fd);
+		list_append(job_id_list, &stepd->step_id.job_id);
+	}
+
+	return SLURM_SUCCESS;
+}
+
+/*
+ * ftw() callback used to restore the namespace state below basepath.
+ *
+ * For every directory other than basepath itself that holds a ".ns" file,
+ * the job id is parsed from the last path component and the namespace is
+ * remounted with _create_ns(). If that job is not in running_job_ids, the
+ * namespace is then removed with _delete_ns().
+ *
+ * Returns SLURM_SUCCESS to continue the walk, or SLURM_ERROR (non-zero,
+ * which makes ftw() stop) on unreadable entries or a failed restore.
+ */
+static int _restore_ns(const char *path, const struct stat *st_buf, int type)
+{
+	int rc = SLURM_SUCCESS;
+	uint32_t job_id;
+	char ns_holder[PATH_MAX];
+	struct stat stat_buf;
+
+	if (type == FTW_NS) {
+		error("%s: Unreachable file of FTW_NS type: %s",
+		      __func__, path);
+		rc = SLURM_ERROR;
+	} else if (type == FTW_DNR) {
+		error("%s: Unreadable directory: %s", __func__, path);
+		rc = SLURM_ERROR;
+	} else if (type == FTW_D && xstrcmp(jc_conf->basepath, path)) {
+		/* Look for a .ns file inside. If it exists, try to restore. */
+		if (snprintf(ns_holder, PATH_MAX, "%s/.ns", path) >= PATH_MAX) {
+			error("%s: Unable to build ns_holder path %s: %m",
+			      __func__, ns_holder);
+			rc = SLURM_ERROR;
+		} else if (stat(ns_holder, &stat_buf) < 0) {
+			debug3("%s: ignoring wrong ns_holder path %s: %m",
+			       __func__, ns_holder);
+		} else {
+			job_id = slurm_atoul(&(xstrrchr(path, '/')[1]));
+			/* At this point we can remount the folder. */
+			if (_create_ns(job_id, true)) {
+				rc = SLURM_ERROR;
+			/* And then, properly delete it for dead jobs. */
+			} else if (!list_find_first(
+					   running_job_ids,
+					   (ListFindF)_find_job_id_in_list,
+					   &job_id)) {
+				rc = _delete_ns(job_id);
+			}
+		}
+	}
+	return rc;
+}
+
 extern void container_p_reconfig(void)
 {
 	return;
@@ -181,6 +280,8 @@ extern int fini(void)
 
 extern int container_p_restore(char *dir_name, bool recover)
 {
+	List steps;
+
 #ifdef HAVE_NATIVE_CRAY
 	return SLURM_SUCCESS;
 #endif
@@ -236,6 +337,14 @@ extern int container_p_restore(char *dir_name, bool recover)
 
 	}
 
+	/*
+	 * Best effort: this may fail if nothing was leaked on basepath;
+	 * otherwise it cleans up as many leaks as possible.
+	 */
+	if (umount2(jc_conf->basepath, MNT_DETACH))
+		debug2("umount2: %s failed: %s", jc_conf->basepath,
+		       strerror(errno));
+
 #if !defined(__APPLE__) && !defined(__FreeBSD__)
 	/*
 	 * MS_BIND mountflag would make mount() ignore all other mountflags
@@ -258,6 +367,32 @@ extern int container_p_restore(char *dir_name, bool recover)
 #endif
 	debug3("tmpfs: Base namespace created");
 
+	steps = stepd_available(conf->spooldir, conf->node_name);
+	running_job_ids = list_create(NULL);
+
+	/*
+	 * Iterate over steps, and check once per job if it's still running.
+	 * running_job_ids only borrows pointers into the elements of steps, so
+	 * steps must stay alive until the traversal below is done.
+	 */
+	(void)list_for_each(steps, _append_job_in_list, running_job_ids);
+
+	/*
+	 * Iterate over basepath, restore only the folders that seem bound to
+	 * real jobs (have .ns file). NOTE: Restoring the state could be either
+	 * deleting the folder if the job is dead and resources are free, or
+	 * mounting it otherwise.
+	 */
+	if (ftw(jc_conf->basepath, _restore_ns, 64)) {
+		error("%s: Directory traversal failed: %s: %s",
+		      __func__, jc_conf->basepath, strerror(errno));
+		FREE_NULL_LIST(running_job_ids);
+		FREE_NULL_LIST(steps);
+		return SLURM_ERROR;
+	}
+	FREE_NULL_LIST(running_job_ids);
+	FREE_NULL_LIST(steps);
+
 	return SLURM_SUCCESS;
 }
 
-- 
GitLab