diff --git a/NEWS b/NEWS
index ae3fb6c267a17b44484b6f6ad4576f766a85cbf3..a66b05fb509c2c0805a0d2a2b6f9720a58821fc7 100644
--- a/NEWS
+++ b/NEWS
@@ -95,6 +95,10 @@ documents those changes that are of interest to users and administrators.
     salloc.
  -- Add scancel -f/--full option to signal all steps including batch script
     and all of its child processes.
+ -- Fix salloc -I to accept an argument.
+ -- Avoid reporting more allocated CPUs than exist on a node. This can be
+    triggered by resuming a previously suspended job, resulting in
+    oversubscription of CPUs.
 
 * Changes in Slurm 15.08.1
 ==========================
diff --git a/doc/html/documentation.shtml b/doc/html/documentation.shtml
index 5491b8dff8f334e1a89f39836acfbcc014094f4c..b6c7fe44f8b52785f0c919446f143ac9f30ccf59 100644
--- a/doc/html/documentation.shtml
+++ b/doc/html/documentation.shtml
@@ -37,6 +37,7 @@ Documentation for other versions of Slurm is distributed with the code</b></p>
 <ul>
 <li><a href="quickstart_admin.html">Quick Start Administrator Guide</a></li>
 <li><a href="accounting.html">Accounting</a></li>
+<li><a href="reservations.html">Advanced Resource Reservation Guide</a></li>
 <li><a href="burst_buffer.html">Burst Buffer Guide</a></li>
 <li><a href="cgroups.html">Cgroups Guide</a></li>
 <li><a href="configurator.html">Configuration Tool (Full version)</a></li>
@@ -136,6 +137,6 @@ Documentation for other versions of Slurm is distributed with the code</b></p>
 </li>
 </ul>
 
-<p style="text-align:center;">Last modified 22 September 2015</p>
+<p style="text-align:center;">Last modified 20 October 2015</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/html/gang_scheduling.shtml b/doc/html/gang_scheduling.shtml
index e3856ced55e75c817a82f9d27cef7ba5c8c0e8cd..fb6ef72e5a6247c8ea132446a442f65897a008d2 100644
--- a/doc/html/gang_scheduling.shtml
+++ b/doc/html/gang_scheduling.shtml
@@ -500,6 +500,14 @@ Note that <I>CR_Core_Memory</I> supports CPU binding, while
 <I>CR_CPU_Memory</I> does not.
 </P>
 
-<p style="text-align:center;">Last modified 24 February 2014</p>
+<P>Note that manually suspending a job (i.e. "scontrol suspend ...") releases
+its CPUs for allocation to other jobs.
+Resuming a previously suspended job may result in multiple jobs being
+allocated the same CPUs, which could trigger gang scheduling of jobs.
+Use of the scancel command to send SIGSTOP and SIGCONT signals would stop a
+job without releasing its CPUs for allocation to other jobs and would be a
+preferable mechanism in many cases.</P>
+
+<p style="text-align:center;">Last modified 20 October 2015</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/html/preempt.shtml b/doc/html/preempt.shtml
index e7a6a60fd8cc36f28ae69f0ba5be3a87e965d08f..5bca8a66cce806131feb85f464b3404c9d2da06c 100644
--- a/doc/html/preempt.shtml
+++ b/doc/html/preempt.shtml
@@ -102,8 +102,12 @@ priority jobs.</LI>
 Checkpointed jobs are not automatically restarted.
 <LI>A value of <I>REQUEUE</I> will requeue (if possible) or kill low priority
 jobs. Requeued jobs are permitted to be restarted on different resources.</LI>
-<LI>A value of <I>SUSPEND</I> will suspend and automatically resume the low
-priority jobs. The <I>SUSPEND</I> option must be used with the <I>GANG</I>
+<LI>A value of <I>SUSPEND</I> will suspend and resume jobs.
+If PreemptType=preempt/partition_prio is configured then a value of
+<I>SUSPEND</I> will suspend and automatically resume the low priority jobs.
+If PreemptType=preempt/qos is configured, then the jobs sharing resources
+will always time-slice rather than one job remaining suspended.
+The <I>SUSPEND</I> option must be used with the <I>GANG</I>
 option (e.g. "PreemptMode=SUSPEND,GANG").</LI>
 <LI>A value of <I>GANG</I> may be used with any of the above values and will
 execute a module responsible for resuming jobs previously suspended for either
@@ -429,6 +433,6 @@ order to support ideal placements such as this, which can quickly complicate
 the design. Any and all help is welcome here!
 </P>
 
-<p style="text-align:center;">Last modified 20 July 2015</p>
+<p style="text-align:center;">Last modified 20 October 2015</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/html/reservations.shtml b/doc/html/reservations.shtml
index 9d2215447fff3c684f47e585771392b11775c0ac..a63e47f204ecc933fd6489830723f39cbefe2e90 100644
--- a/doc/html/reservations.shtml
+++ b/doc/html/reservations.shtml
@@ -1,6 +1,6 @@
 <!--#include virtual="header.txt"-->
 
-<h1>Resource Reservation Guide</h1>
+<h1>Advanced Resource Reservation Guide</h1>
 
 <p>Slurm has the ability to reserve resources for jobs
 being executed by select users and/or select bank accounts.
@@ -422,7 +422,7 @@ considering the initiation of jobs. This will prevent the initiation of some
 jobs which would complete execution before a reservation given fewer jobs to
 time-slice with.</p>
 
-<p style="text-align: center;">Last modified 24 June 2015</p>
+<p style="text-align: center;">Last modified 20 October 2015</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1
index abe90ba14b4ca7310864a2f4d7d0bfe3c1074fd8..3060f74e7af4a4b576d975aef65718ea8438e7f4 100644
--- a/doc/man/man1/scontrol.1
+++ b/doc/man/man1/scontrol.1
@@ -295,7 +295,7 @@ The job_list argument is a comma separated list of job IDs.
 A held job can be released using scontrol to reset its priority
 (e.g. "scontrol release <job_id>"). The command accepts the following option:
 .RS
-.TP 12
+.TP
 \fIState=SpecialExit\fP
 The "SpecialExit" keyword specifies that the job has to be put in a special
 state \fBJOB_SPECIAL_EXIT\fP.
@@ -303,11 +303,21 @@ The "scontrol show job" command will display the JobState as
 \fBSPECIAL_EXIT\fP, while the "squeue" command as \fBSE\fP.
 .RE
 
+.TP
 \fBresume\fP \fIjob_list\fP
 Resume a previously suspended job.
 The job_list argument is a comma separated list of job IDs.
 Also see \fBsuspend\fR.
+\fBNOTE:\fR A suspended job releases its CPUs for allocation to other jobs.
+Resuming a previously suspended job may result in multiple jobs being
+allocated the same CPUs, which could trigger gang scheduling with some
+configurations or severe degradation in performance with other configurations.
+Use of the scancel command to send SIGSTOP and SIGCONT signals would stop a
+job without releasing its CPUs for allocation to other jobs and would be a
+preferable mechanism in many cases.
+Use with caution.
+
 .TP
 \fBschedloglevel\fP \fILEVEL\fP
 Enable or disable scheduler logging.
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index f73e04b870f90b211973fdd6de91a87e41d988aa..592b717bada23b9fbf171553fa76ccd6e2293475 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -1650,9 +1650,10 @@ not recommended.
 preempts jobs by requeuing them (if possible) or canceling them.
 .TP
 \fBSUSPEND\fR
-preempts jobs by suspending them.
-A suspended job will resume execution once the high priority job
-preempting it completes.
+If PreemptType=preempt/partition_prio is configured then the low priority
+jobs will be suspended and automatically resumed.
+If PreemptType=preempt/qos is configured, then the jobs sharing resources
+will always time-slice rather than one job remaining suspended.
 The \fBSUSPEND\fR may only be used with the \fBGANG\fR option
 (the gang scheduler module performs the job resume operation).
 .RE
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 9ea221185b7d981fa7498f5bf73b87de315d1b7e..9d8c7e512ccd6c7087ab29b54e30eb26a0b03b47 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -875,11 +875,10 @@ static int _add_job_to_res(struct job_record *job_ptr, int action)
 			/* Job started or resumed and it's allocated resources
 			 * are already in use by some other job. Typically due
 			 * to manually resuming a job. */
-			error("cons_res: ERROR: job overflow: "
+			error("cons_res: job overflow: "
 			      "could not find idle resources for job %u",
 			      job_ptr->job_id);
-			/* just add the job to the last row for now */
-			_add_job_to_row(job, &(p_ptr->row[p_ptr->num_rows-1]));
+			/* No row available to record this job */
 		}
 		/* update the node state */
 		for (i = 0, n = -1; i < select_node_cnt; i++) {
@@ -2384,10 +2383,10 @@ extern int select_p_select_nodeinfo_set_all(void)
 {
 	struct part_res_record *p_ptr;
 	struct node_record *node_ptr = NULL;
-	int i=0, n=0, start, end;
-	uint16_t tmp, tmp_16 = 0, tmp_part;
+	int i, n, start, end;
+	uint16_t tmp, tmp_part;
 	static time_t last_set_all = 0;
-	uint32_t node_threads, node_cpus;
+	uint32_t alloc_cpus, node_cores, node_cpus, node_threads;
 
 	/* only set this once when the last_node_update is newer than
 	 * the last time we set things up. */
@@ -2423,7 +2422,7 @@ extern int select_p_select_nodeinfo_set_all(void)
 
 		start = cr_get_coremap_offset(n);
 		end = cr_get_coremap_offset(n+1);
-		tmp_16 = 0;
+		alloc_cpus = 0;
 		for (p_ptr = select_part_record; p_ptr; p_ptr = p_ptr->next) {
 			if (!p_ptr->row)
 				continue;
@@ -2431,20 +2430,27 @@ extern int select_p_select_nodeinfo_set_all(void)
 			for (i = 0; i < p_ptr->num_rows; i++) {
 				if (!p_ptr->row[i].row_bitmap)
 					continue;
-				tmp = bit_set_count_range(p_ptr->row[i].row_bitmap,
-							  start, end);
+				tmp = bit_set_count_range(
+					p_ptr->row[i].row_bitmap,
+					start, end);
 				/* Report row with largest CPU count */
 				tmp_part = MAX(tmp, tmp_part);
 			}
-			tmp_16 += tmp_part;	/* Add CPU counts all parts */
+			alloc_cpus += tmp_part;	/* Add CPU counts all parts */
 		}
 
-		/* The minimum allocatable unit may a core, so scale
-		 * threads up to the proper CPU count */
-		if ((end - start) < node_cpus)
-			tmp_16 *= node_threads;
+		node_cores = end - start;
+		/* Administrator could resume suspended jobs and oversubscribe
+		 * cores; avoid reporting more cores in use than configured */
+		if (alloc_cpus > node_cores)
+			alloc_cpus = node_cores;
 
-		nodeinfo->alloc_cpus = tmp_16;
+		/* The minimum allocatable unit may be a core, so scale by
+		 * thread count up to the proper CPU count as needed */
+		if (node_cores < node_cpus)
+			alloc_cpus *= node_threads;
+
+		nodeinfo->alloc_cpus = alloc_cpus;
 		if (select_node_record) {
 			nodeinfo->alloc_memory =
 				select_node_usage[n].alloc_memory;
diff --git a/src/salloc/opt.c b/src/salloc/opt.c
index c6e5527eb763c911d144ce91259ac6380a514a57..7f7da5a2a2e7911f763a99862801e3bd796b3786 100644
--- a/src/salloc/opt.c
+++ b/src/salloc/opt.c
@@ -725,7 +725,7 @@ void set_options(const int argc, char **argv)
 		{NULL, 0, 0, 0}
 	};
 	char *opt_string =
-		"+A:B:c:C:d:D:F:g:hHIJ:kK::L:m:n:N:Op:P:QRsS:t:uU:vVw:W:x:";
+		"+A:B:c:C:d:D:F:g:hHI::J:kK::L:m:n:N:Op:P:QRsS:t:uU:vVw:W:x:";
 	char *pos_delimit;
 	struct option *optz = spank_option_table_create(long_options);
 
diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c
index 156da9a473ebde5751d7e1884887f81aeef7c9cb..5db7f74740d42edca78167d92bfcffb7ffa666be 100644
--- a/src/slurmctld/power_save.c
+++ b/src/slurmctld/power_save.c
@@ -487,12 +487,12 @@ static int _init_power_config(void)
 		debug("power_save module disabled, SuspendTime < 0");
 		return -1;
 	}
-	if (suspend_rate < 1) {
-		error("power_save module disabled, SuspendRate < 1");
+	if (suspend_rate < 0) {
+		error("power_save module disabled, SuspendRate < 0");
 		return -1;
 	}
-	if (resume_rate < 1) {
-		error("power_save module disabled, ResumeRate < 1");
+	if (resume_rate < 0) {
+		error("power_save module disabled, ResumeRate < 0");
 		return -1;
 	}
 	if (suspend_prog == NULL) {
@@ -609,7 +609,7 @@ extern void start_power_mgr(pthread_t *thread_id)
 }
 
 /*
- * init_power_save - Onitialize the power save module. Started as a
+ * init_power_save - Initialize the power save module. Started as a
  * pthread. Terminates automatically at slurmctld shutdown time.
  * Input and output are unused.
  */