diff --git a/BUILD.NOTES b/BUILD.NOTES index 8601681e62201035907439601805566394d0c7ec..6a78cf62c3a9f72c7545ad0edcc89bea9e89170e 100644 --- a/BUILD.NOTES +++ b/BUILD.NOTES @@ -233,6 +233,8 @@ For memory leaks (for AIX use zerofault, zf; for linux use valgrind) ./slurmctld -Dc >valg.ctld.out 2>&1 - valgrind --tool=memcheck --leak-check=yes --num-callers=8 --leak-resolution=med \ ./slurmd -Dc >valg.slurmd.out 2>&1 (Probably only one one node of cluster) + - valgrind --tool=memcheck --leak-check=yes --num-callers=8 --leak-resolution=med \ + ./slurmdbd -D >valg.dbd.out 2>&1 - Run the regression test. In the globals.local file include: "set enable_memory_leak_debug 1" - Shutdown the daemons using "scontrol shutdown" @@ -243,7 +245,11 @@ For memory leaks (for AIX use zerofault, zf; for linux use valgrind) valgrind backtrace after shutdown. Rebuilding the daemons without the configure option of "--enable-memory-leak-debug" typically prevents the plugin from being unloaded so the symbols will be properly reported. However - many memory leaks will be reported due to not unloading plugins. + many memory leaks will be reported due to not unloading plugins. You will + need to match the call sequence from the first log (with + "--enable-memory-leak-debug") to the second log (without + "--enable-memory-leak-debug", ignoring memory leaks reported that are not + real leaks in the second log) to identify the full code path through plugins. Job profiling: - "export CFLAGS=-pg", then run "configure" and "make install" as usual. diff --git a/NEWS b/NEWS index 23837862d5ed0f11c7bae40167395baae16a1c2e..273eac7ed5ded72222bceff945acc3ba5015908e 100644 --- a/NEWS +++ b/NEWS @@ -99,6 +99,12 @@ documents those changes that are of interest to users and administrators. -- Add reservation flag of "purge_comp" which will purge an advanced reservation once it has no more active (pending, suspended or running) jobs. 
+* Changes in Slurm 15.08.8 +========================== + -- Backfill scheduling properly synchronized with Cray Node Health Check. + Prior logic could result in highest priority job getting improperly + postponed. + * Changes in Slurm 15.08.7 ========================== -- sched/backfill: If a job can not be started within the configured @@ -175,6 +181,8 @@ documents those changes that are of interest to users and administrators. while there is a suspended job. Previous logic would add the CPUs, but not memory or GPUs. This would result in underflow/overflow errors in select cons_res plugin. + -- Strip flags from a job state in qstat wrapper before evaluating. + -- Add job states missing from the qstat wrapper. * Changes in Slurm 15.08.6 ========================== diff --git a/contribs/perlapi/libslurm/perl/lib/Slurm/Constant.pm b/contribs/perlapi/libslurm/perl/lib/Slurm/Constant.pm index 0dfb197eda32b9adfe2724db971135b25aa30d18..21111ae4b57115b46e05863bb6217837e8505e1d 100644 --- a/contribs/perlapi/libslurm/perl/lib/Slurm/Constant.pm +++ b/contribs/perlapi/libslurm/perl/lib/Slurm/Constant.pm @@ -495,7 +495,11 @@ This package export constants for use with Slurm. 
This includes enumerations and =item * JOB_NODE_FAIL 7 -=item * JOB_END 8 +=item * JOB_PREEMPTED 8 + +=item * JOB_BOOT_FAIL 9 + +=item * JOB_END 10 =back diff --git a/contribs/torque/qstat.pl b/contribs/torque/qstat.pl index 8d7a2d7ac0055da8cc3a12d746e2cfb4c7227684..a040d6db9e614209738aa288b2b0e9049b8b2701 100755 --- a/contribs/torque/qstat.pl +++ b/contribs/torque/qstat.pl @@ -245,11 +245,13 @@ sub stateCode if(!defined($state)) { return 'U'; } - - switch($state) { + switch($state & JOB_STATE_BASE) { case [JOB_COMPLETE, JOB_CANCELLED, JOB_TIMEOUT, + JOB_NODE_FAIL, + JOB_PREEMPTED, + JOB_BOOT_FAIL, JOB_FAILED] { return 'C' } case [JOB_RUNNING] { return 'R' } case [JOB_PENDING] { return 'Q' } diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index a2e4edaa483f189017e99b2979d99f26598bc094..9c49e57bf1f5db27863e3eede4548509e6cda642 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -1713,6 +1713,22 @@ static time_t _guess_job_end(struct job_record * job_ptr, time_t now) return end_time; } +/* Return TRUE if job is in the process of cleaning up. + * This is used for Cray systems to indicate the Node Health Check (NHC) + * is still running. Until NHC completes, the job's resource use persists in + the select/cons_res plugin data structures. */ +static bool _job_cleaning(struct job_record *job_ptr) +{ + uint16_t cleaning = 0; + + select_g_select_jobinfo_get(job_ptr->select_jobinfo, + SELECT_JOBDATA_CLEANING, + &cleaning); + if (cleaning) + return true; + return false; +} + /* _will_run_test - determine when and where a pending job can start, removes * jobs from node table at termination time and run _test_job() after * each one. Used by SLURM's sched/backfill plugin and Moab. 
*/ @@ -1778,7 +1794,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, job_iterator = list_iterator_create(job_list); while ((tmp_job_ptr = (struct job_record *) list_next(job_iterator))) { if (!IS_JOB_RUNNING(tmp_job_ptr) && - !IS_JOB_SUSPENDED(tmp_job_ptr)) + !IS_JOB_SUSPENDED(tmp_job_ptr) && + !_job_cleaning(tmp_job_ptr)) continue; if (tmp_job_ptr->end_time == 0) { error("Job %u has zero end_time", tmp_job_ptr->job_id);