diff --git a/NEWS b/NEWS index 68b5b1e1acc394da64d6334082993de5d3b94f22..2d979088956cc12d253ae81d8f28ae5abed86242 100644 --- a/NEWS +++ b/NEWS @@ -18,6 +18,7 @@ documents those changes that are of interest to users and admins. -- Correct a minore error in the scancel.1 man page related to the --signal option. -- Enhance the scancel.1 man page to document the sequence of signals sent + -- Fix slurmstepd core dump if the cgroup hierarchy is not completed. when terminating the job. -- Fix hostlist_shift to be able to give correct node names on names with a different number of dimensions than the cluster. diff --git a/src/common/xcgroup.c b/src/common/xcgroup.c index d2fb81afe3c673d7f42c99679da39ac715e032f1..b2d20e2e6093fe768401143d708ef255d6355656 100644 --- a/src/common/xcgroup.c +++ b/src/common/xcgroup.c @@ -468,8 +468,8 @@ int xcgroup_instanciate(xcgroup_t* cg) /* build cgroup */ if (mkdir(file_path, 0755)) { if (create_only || errno != EEXIST) { - debug2("unable to create cgroup '%s' : %m", - file_path); + debug2("%s: unable to create cgroup '%s' : %m", + __func__, file_path); umask(omask); return fstatus; } diff --git a/src/plugins/jobacct_gather/cgroup/jobacct_gather_cgroup.c b/src/plugins/jobacct_gather/cgroup/jobacct_gather_cgroup.c index dcc1ba261470d5e11dc3bdb098d32ffb02230273..822725d3e8b4afb3d6daaa9af53dfcb1820b6d12 100644 --- a/src/plugins/jobacct_gather/cgroup/jobacct_gather_cgroup.c +++ b/src/plugins/jobacct_gather/cgroup/jobacct_gather_cgroup.c @@ -108,33 +108,44 @@ static void _prec_extra(jag_prec_t *prec) char *cpu_time = NULL, *memory_stat = NULL, *ptr; size_t cpu_time_size = 0, memory_stat_size = 0; - //DEF_TIMERS; //START_TIMER; /* info("before"); */ /* print_jag_prec(prec); */ xcgroup_get_param(&task_cpuacct_cg, "cpuacct.stat", &cpu_time, &cpu_time_size); - sscanf(cpu_time, "%*s %lu %*s %lu", &utime, &stime); - prec->usec = utime; - prec->ssec = stime; + if (cpu_time == NULL) { + error("%s: failed to collect cpuacct.stat pid %d ppid %d", + __func__, prec->pid, prec->ppid); + } else { + sscanf(cpu_time, "%*s %lu %*s %lu", &utime, &stime); + prec->usec = utime; + prec->ssec = stime; + } + xcgroup_get_param(&task_memory_cg, "memory.stat", &memory_stat, &memory_stat_size); - /* This number represents the amount of "dirty" private memory - used by the cgroup. From our experience this is slightly - different than what proc presents, but is probably more - accurate on what the user is actually using. - */ - ptr = strstr(memory_stat, "total_rss"); - sscanf(ptr, "total_rss %lu", &total_rss); - prec->rss = total_rss / 1024; /* convert from bytes to KB */ - - /* total_pgmajfault is what is reported in proc, so we use - * the same thing here. */ - if ((ptr = strstr(memory_stat, "total_pgmajfault"))) { - sscanf(ptr, "total_pgmajfault %lu", &total_pgpgin); - prec->pages = total_pgpgin; + if (memory_stat == NULL) { + error("%s: failed to collect memory.stat pid %d ppid %d", + __func__, prec->pid, prec->ppid); + } else { + /* This number represents the amount of "dirty" private memory + used by the cgroup. From our experience this is slightly + different than what proc presents, but is probably more + accurate on what the user is actually using. + */ + ptr = strstr(memory_stat, "total_rss"); + sscanf(ptr, "total_rss %lu", &total_rss); + prec->rss = total_rss / 1024; /* convert from bytes to KB */ + + /* total_pgmajfault is what is reported in proc, so we use + * the same thing here. */ + if ((ptr = strstr(memory_stat, "total_pgmajfault"))) { + sscanf(ptr, "total_pgmajfault %lu", &total_pgpgin); + prec->pages = total_pgpgin; + } } + xfree(cpu_time); xfree(memory_stat);