diff --git a/NEWS b/NEWS index d627133e4d05914ad2dad21c412563ad4a4e6c7a..e6314aef8982428173e01ba95fa8df36b9552c36 100644 --- a/NEWS +++ b/NEWS @@ -90,6 +90,7 @@ documents those changes that are of interest to users and administrators. NOTE: Job arrays submitted to Slurm version 15.08.6 or later will fail if the slurmctld daemon is downgraded to an earlier version of Slurm. -- Move slurmctld mail handler to separate thread for improved performance. + -- Fix containment of adopted processes from pam_slurm_adopt. * Changes in Slurm 15.08.5 ========================== diff --git a/doc/man/man1/sdiag.1 b/doc/man/man1/sdiag.1 index f7c2fc6721acd75cb6712ac572b4979f1de4cabc..42dd620f8220de923aaaa152020f22f6bc05ec71 100644 --- a/doc/man/man1/sdiag.1 +++ b/doc/man/man1/sdiag.1 @@ -6,7 +6,7 @@ sdiag \- Scheduling diagnostic tool for Slurm .SH "SYNOPSIS" .LP -sview +sdiag .SH "DESCRIPTION" .LP @@ -206,7 +206,7 @@ Sort Remote Procedure Call (RPC) data by message type ID and user ID. .TP \fB\-r\fR, \fB\-\-reset\fR -Reset counters. Only supported for Slurm operators and administers. +Reset counters. Only supported for Slurm operators and administrators. .TP \fB\-t\fR, \fB\-\-sort\-by\-time\fR diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index 714ed93b868a2e42913fc8535491127ca1a140ec..83c07476503eb9fa144de451bfd972e0dd3c0934 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -1238,7 +1238,6 @@ static void *_wait_extern_pid(void *args) pid_t pid = extern_pid->pid; jobacctinfo_t *jobacct = NULL; - jobacct_id_t jobacct_id; pid_t *pids = NULL; int npids = 0, i; char proc_stat_file[256]; /* Allow ~20x extra length */ @@ -1249,14 +1248,6 @@ static void *_wait_extern_pid(void *args) xfree(extern_pid); - jobacct_id.taskid = job->nodeid; - jobacct_id.nodeid = job->nodeid; - jobacct_id.job = job; - - proctrack_g_add(job, pid); - task_g_add_pid(pid); - jobacct_gather_add_task(pid, &jobacct_id, 1); - //info("waiting on pid %d", pid); _block_on_pid(pid); //info("done with pid %d %d: %m", pid, rc); @@ -1307,6 +1298,7 @@ static int _handle_add_extern_pid_internal(stepd_step_rec_t *job, pid_t pid) pthread_t thread_id; pthread_attr_t attr; extern_pid_t *extern_pid; + jobacct_id_t jobacct_id; int retries = 0, rc = SLURM_SUCCESS; if (job->stepid != SLURM_EXTERN_CONT) { @@ -1322,6 +1314,17 @@ static int _handle_add_extern_pid_internal(stepd_step_rec_t *job, pid_t pid) extern_pid->job = job; extern_pid->pid = pid; + /* track pid: add outside of the below thread so that the pam module + * waits until the parent pid is added, before letting the parent spawn + * any children. */ + jobacct_id.taskid = job->nodeid; + jobacct_id.nodeid = job->nodeid; + jobacct_id.job = job; + + proctrack_g_add(job, pid); + task_g_add_pid(pid); + jobacct_gather_add_task(pid, &jobacct_id, 1); + /* spawn a thread that will wait on the pid given */ slurm_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am index 048678cc3e81c75e9fd642061e5b3c8493861ddb..10b96864689e290a7cd48082e64a107e7766d371 100644 --- a/testsuite/expect/Makefile.am +++ b/testsuite/expect/Makefile.am @@ -522,6 +522,7 @@ EXTRA_DIST = \ inc21.30.14 \ inc21.30.15 \ inc21.30.16 \ + inc21.30.17 \ inc21.34.1 \ inc21.34.2 \ inc21.34_test \ diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in index c40970b1669ff139703465723208ed7777baeb38..a661a8b8b51434e8278b3750487e66c55c50b6ff 100644 --- a/testsuite/expect/Makefile.in +++ b/testsuite/expect/Makefile.in @@ -923,6 +923,7 @@ EXTRA_DIST = \ inc21.30.14 \ inc21.30.15 \ inc21.30.16 \ + inc21.30.17 \ inc21.34.1 \ inc21.34.2 \ inc21.34_test \ diff --git a/testsuite/expect/inc21.34_tests b/testsuite/expect/inc21.34_tests index 3615af9b875b377da7c279565a0d9fd2383bdc40..4bd8c3e8729779f7fe4b1e4d7e8b714d7aa25fde 100644 --- a/testsuite/expect/inc21.34_tests +++ b/testsuite/expect/inc21.34_tests @@ -40,6 +40,7 @@ source ./inc21.30.13 source ./inc21.30.14 source ./inc21.30.15 source ./inc21.30.16 +source ./inc21.30.17 ########################################################## # @@ -321,6 +322,23 @@ proc part_test { } { } set mod_job_qos(MaxNodesPerUser) "-1" set mod_part_qos(MaxNodesPerUser) "-1" + + # + # QOS/Parent QOS MaxWall + # + set mod_job_qos(MaxWall) 1 + set mod_part_qos(MaxWall) $maxwall_num + mod_qos $part_qos [array get mod_part_qos] + mod_qos $job_qos [array get mod_job_qos] + sleep $time_spacing + inc21_30_17 + if {$exit_code != 0 } { + cleanup + exit 1 + } + set mod_job_qos(MaxWall) "-1" + set mod_part_qos(MaxWall) "-1" + } ########################################################## @@ -607,4 +625,20 @@ proc qos_test { } { set mod_job_qos(MaxNodesPerUser) "-1" set mod_part_qos(MaxNodesPerUser) "-1" + # + # QOS/Parent QOS MaxWall + # + set mod_job_qos(MaxWall) $maxwall_num + set mod_part_qos(MaxWall) 1 + mod_qos $part_qos [array get mod_part_qos] + mod_qos $job_qos [array get mod_job_qos] + sleep $time_spacing + inc21_30_17 + if {$exit_code != 0 } { + cleanup + exit 1 + } + set mod_job_qos(MaxWall) "-1" + set mod_part_qos(MaxWall) "-1" + } diff --git a/testsuite/expect/test21.30 b/testsuite/expect/test21.30 index 4f1553dc0f2dd0475a6d91c3733de2f26b1eebcb..8da3e1fedb797f440f6700229f327f2905b9bad6 100755 --- a/testsuite/expect/test21.30 +++ b/testsuite/expect/test21.30 @@ -46,6 +46,7 @@ source ./inc21.30.13 source ./inc21.30.14 source ./inc21.30.15 source ./inc21.30.16 +source ./inc21.30.17 set test_id "21.30" set exit_code 0 @@ -636,4 +637,17 @@ if {$exit_code != 0 } { } set mod_qos_vals(MaxNodesPerUser) "-1" +# +# Test MaxWall is used as job's timelimit if job was requested +# without --time option +# +set mod_qos_vals(MaxWall) $maxwall_num +mod_qos $qostest [array get mod_qos_vals] +sleep $time_spacing +inc21_30_17 +if {$exit_code != 0 } { + endit +} +set mod_qos_vals(MaxWall) "-1" + endit