From 0b97809f0c0017ab9da4568b3a41b42e6546a8b0 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 11 Apr 2008 22:09:57 +0000
Subject: [PATCH] Add preemption web page

---
 doc/html/Makefile.am            |   1 +
 doc/html/Makefile.in            |   1 +
 doc/html/documentation.shtml    |   1 +
 doc/html/preempt.shtml          | 247 ++++++++++++++++++++++++++++++++
 doc/html/quickstart_admin.shtml |   3 +-
 5 files changed, 252 insertions(+), 1 deletion(-)
 create mode 100644 doc/html/preempt.shtml

diff --git a/doc/html/Makefile.am b/doc/html/Makefile.am
index aeb97576fc2..6b9fae58332 100644
--- a/doc/html/Makefile.am
+++ b/doc/html/Makefile.am
@@ -31,6 +31,7 @@ generated_html = \
 	platforms.html \
 	plugins.html \
 	power_save.html \
+	preempt.html \
 	proctrack_plugins.html \
 	programmer_guide.html \
 	publications.html \
diff --git a/doc/html/Makefile.in b/doc/html/Makefile.in
index 858d0527e03..d1dba79b190 100644
--- a/doc/html/Makefile.in
+++ b/doc/html/Makefile.in
@@ -271,6 +271,7 @@ generated_html = \
 	platforms.html \
 	plugins.html \
 	power_save.html \
+	preempt.html \
 	proctrack_plugins.html \
 	programmer_guide.html \
 	publications.html \
diff --git a/doc/html/documentation.shtml b/doc/html/documentation.shtml
index 4f3d8c5a130..7422b0c5524 100644
--- a/doc/html/documentation.shtml
+++ b/doc/html/documentation.shtml
@@ -29,6 +29,7 @@
 <li><a href="cons_res_share.shtml">Sharing Consumable Resources</a></li>
 <li><a href="accounting.shtml">Accounting</a></li>
 <li><a href="gang_scheduling.shtml">Gang Scheduling</a></li>
+<li><a href="preempt.shtml">Preemption</a></li>
 <li><a href="maui.shtml">Maui Scheduler Integration Guide</a></li>
 <li><a href="moab.shtml">Moab Cluster Suite Integration Guide</a></li>
 <li><a href="http://docs.hp.com/en/5991-4847/ch09s02.html">Submitting Jobs through LSF</a></li>
diff --git a/doc/html/preempt.shtml b/doc/html/preempt.shtml
new file mode 100644
index 00000000000..d58acf00324
--- /dev/null
+++ b/doc/html/preempt.shtml
@@ -0,0 +1,247 @@
+<!--#include virtual="header.txt"-->
+
+<H1>Preemption</H1>
+
+<P>
+SLURM version 1.2 and earlier dedicated resources to jobs based on a simple
+"first come, first served" policy with backfill.
+Beginning in SLURM version 1.3, priority-based <I>preemption</I> is supported.
+Preemption is the act of suspending one or more "low-priority" jobs to let a
+"high-priority" job run uninterrupted until it completes. Preemption provides
+the ability to prioritize the workload on a cluster.
+</P>
+<P>
+The SLURM version 1.3.1 <I>sched/gang</I> plugin supports preemption.
+When configured,
+the plugin monitors each of the partitions in SLURM. If a new job in a
+high-priority partition is allocated resources that have already been
+allocated to one or more existing jobs from lower priority partitions, the
+plugin respects the partition priority and suspends the low-priority job(s). The
+low-priority job(s) remain suspended until the job from the high-priority
+partition completes, at which point they are resumed.
+</P>
+
+<H2>Configuration</H2>
+<P>
+There are several important configuration parameters relating to preemption
+(a combined <I>slurm.conf</I> sketch follows this list):
+</P>
+<UL>
+<LI>
+<B>SelectType</B>: The SLURM <I>sched/gang</I> plugin supports nodes 
+allocated by the <I>select/linear</I> plugin and socket/core/CPU resources 
+allocated by the <I>select/cons_res</I> plugin. 
+See <A HREF="#future_work">Future Work</A> below for more
+information on "preemption with consumable resources".
+</LI>
+<LI>
+<B>SelectTypeParameter</B>: Since resources will be overallocated with jobs
+(a preempted job remains resident in memory), the resource selection
+plugin should be configured to track the amount of memory used by each job to
+ensure that memory page swapping does not occur. When <I>select/linear</I> is
+chosen, we recommend setting <I>SelectTypeParameter=CR_Memory</I>. When
+<I>select/cons_res</I> is chosen, we recommend including Memory as a resource
+(ex. <I>SelectTypeParameter=CR_Core_Memory</I>).
+</LI>
+<LI>
+<B>DefMemPerTask</B>: Since job requests may not explicitly specify 
+a memory requirement, we also recommend configuring <I>DefMemPerTask</I> 
+(default memory per task). It may also be desirable to configure 
+<I>MaxMemPerTask</I> (maximum memory per task) in <I>slurm.conf</I>.
+</LI>
+<LI>
+<B>JobAcctGatherType and JobAcctGatherFrequency</B>:
+If you wish to enforce memory limits, accounting must be enabled
+using the <I>JobAcctGatherType</I> and <I>JobAcctGatherFrequency</I>
+parameters. If accounting is enabled and a job exceeds its configured
+memory limits, it will be canceled in order to prevent it from 
+adversely affecting other jobs sharing the same resources.
+</LI>
+<LI>
+<B>SchedulerType</B>: Configure the <I>sched/gang</I> plugin by setting
+<I>SchedulerType=sched/gang</I> in <I>slurm.conf</I>.
+</LI>
+<LI>
+<B>Priority</B>: Configure the partition's <I>Priority</I> setting relative to
+other partitions to control the preemptive behavior. If two jobs from two
+different partitions are allocated to the same resources, the job in the
+partition with the greater <I>Priority</I> value will preempt the job in the
+partition with the lesser <I>Priority</I> value. If the <I>Priority</I> values
+of the two partitions are equal then no preemption will occur, and the two jobs
+will run simultaneously on the same resources. The default <I>Priority</I> value
+is 1.
+</LI>
+<LI>
+<B>Shared</B>: Configure the partition's <I>Shared</I> setting to 
+<I>FORCE</I> for all partitions that will preempt or that will be preempted. The
+<I>FORCE</I> setting is required to enable the select plugins to overallocate
+resources. Jobs submitted to a partition that does not share its resources will
+not preempt other jobs, nor will they be preempted. Instead, those jobs
+will wait until the resources are free for dedicated (non-shared) use.
+<BR>
+The <I>FORCE</I> option now supports an additional parameter that controls 
+how many jobs can share a resource within the partition (FORCE[:max_share]). By
+default the max_share value is 4. To disable timeslicing within a partition but
+enable preemption with other partitions, set <I>Shared=FORCE:1</I>.
+</LI>
+<LI>
+<B>SchedulerTimeSlice</B>: The default timeslice interval is 30 seconds. 
+To change this duration, set <I>SchedulerTimeSlice</I> to the desired interval 
+(in seconds) in <I>slurm.conf</I>. For example, to set the timeslice interval 
+to one minute, set <I>SchedulerTimeSlice=60</I>. Short values can increase 
+the overhead of gang scheduling. This parameter is only relevant if timeslicing
+within a partition will be configured. Preemption and timeslicing can occur at
+the same time.
+</LI>
+</UL>
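+<P>
+Putting these settings together, a combined <I>slurm.conf</I> sketch is shown
+below. It is only an illustration: the node list and partition layout are taken
+from the example later on this page, while the memory size, accounting plugin
+name, and accounting frequency are placeholder values to be adapted to your
+site (check the <I>slurm.conf</I> man page of your release for the exact
+keywords and values).
+</P>
+<PRE>
+# Resource selection: whole-node allocation with memory tracking
+SelectType=select/linear
+SelectTypeParameter=CR_Memory
+# Default MB of memory per task (placeholder value)
+DefMemPerTask=512
+
+# Accounting is needed to enforce the memory limits (plugin name assumed)
+JobAcctGatherType=jobacct_gather/linux
+JobAcctGatherFrequency=30
+
+# Gang scheduler provides preemption (and optional timeslicing)
+SchedulerType=sched/gang
+SchedulerTimeSlice=60
+
+# "hipri" (Priority=2) preempts "active" (Priority=1); FORCE:1 disables
+# timeslicing within each partition but still allows preemption
+PartitionName=active Priority=1 Default=YES Shared=FORCE:1 Nodes=n[12-16]
+PartitionName=hipri  Priority=2             Shared=FORCE:1 Nodes=n[12-16]
+</PRE>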
+<P>
+To enable preemption after making the configuration changes described above,
+restart SLURM if it is already running. Any change to the plugin settings in
+SLURM requires a full restart of the daemons. If you only change the partition
+<I>Priority</I> or <I>Shared</I> settings, the update can be applied with
+<I>scontrol reconfig</I>.
+</P>
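+<P>
+For example (the init script path below is an assumption; use whatever
+mechanism starts and stops the SLURM daemons at your site):
+</P>
+<PRE>
+# Plugin settings changed (e.g. SchedulerType=sched/gang added): restart daemons
+# (init script path assumed; adapt to your installation)
+/etc/init.d/slurm stop
+/etc/init.d/slurm start
+
+# Only a partition's Priority or Shared setting changed:
+scontrol reconfig
+</PRE>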
+
+<H2>Preemption Design and Operation</H2>
+
+<P>
+When enabled, the <I>sched/gang</I> plugin keeps track of the resources
+allocated to all jobs. For each partition an "active bitmap" is maintained that
+tracks all concurrently running jobs in the SLURM cluster. Each partition also
+maintains a job list for that partition, and a list of "shadow" jobs. These
+"shadow" jobs are running jobs from higher priority partitions that "cast
+shadows" on the active bitmaps of the lower priority partitions. 
+</P>
+<P>
+Each time a new job is allocated resources in a partition and begins running,
+the <I>sched/gang</I> plugin adds a "shadow" of this job to all lower priority
+partitions. The active bitmaps of these lower priority partitions are then
+rebuilt, with the shadow jobs added first. Any existing jobs that were replaced
+by one or more "shadow" jobs are suspended (preempted). Conversely, when a 
+high-priority running job completes, its "shadow" goes away and the active 
+bitmaps of the lower priority partitions are rebuilt to see if any suspended 
+jobs can be resumed.
+</P>
+<P>
+The gang scheduler plugin is primarily designed to be <I>reactive</I> to the
+resource allocation decisions made by the Selector plugins. This is why
+<I>Shared=FORCE</I> is required in each partition. The <I>Shared=FORCE</I>
+setting enables the <I>select/linear</I> and <I>select/cons_res</I> plugins to
+overallocate the resources between partitions. This keeps all of the node
+placement logic in the <I>select</I> plugins, and leaves the gang scheduler in
+charge of controlling which jobs should run on the overallocated resources. 
+</P>
+<P>
+The <I>sched/gang</I> plugin suspends jobs via the same internal functions that
+support <I>scontrol suspend</I> and <I>scontrol resume</I>. A good way to
+observe the act of preemption is by running <I>watch squeue</I> in a terminal
+window.
+</P>
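+<P>
+For example, the following commands (using a hypothetical job ID of 1234)
+exercise the same suspend and resume mechanism by hand and then watch the
+queue as preemption occurs:
+</P>
+<PRE>
+# Job ID 1234 is hypothetical; substitute the ID of a running low-priority job
+scontrol suspend 1234
+squeue -j 1234        # the ST column should now show 'S' (suspended)
+scontrol resume 1234
+watch squeue          # watch job states flip between R and S as preemption occurs
+</PRE>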
+
+<H2>A Simple Example</H2>
+
+<P>
+The following example is configured with <I>select/linear</I>,
+<I>sched/gang</I>, and <I>Shared=FORCE:1</I>. This example takes place on a
+cluster of 5 nodes:
+</P>
+<PRE>
+[user@n16 ~]$ <B>sinfo</B>
+PARTITION AVAIL  TIMELIMIT NODES  STATE NODELIST
+active*      up   infinite     5   idle n[12-16]
+hipri        up   infinite     5   idle n[12-16]
+</PRE>
+<P>
+Here are the Partition settings:
+</P>
+<PRE>
+[user@n16 ~]$ <B>grep PartitionName /shared/slurm/slurm.conf</B>
+PartitionName=active Priority=1 Default=YES Shared=FORCE:1 Nodes=n[12-16]
+PartitionName=hipri  Priority=2             Shared=FORCE:1 Nodes=n[12-16]
+[user@n16 ~]$ 
+</PRE>
+<P>
+The <I>runit.pl</I> script launches a simple load-generating app that runs
+for the given number of seconds. Submit 5 single-node <I>runit.pl</I> jobs to
+run on all nodes:
+</P>
+<PRE>
+[user@n16 ~]$ <B>sbatch -N1 ./runit.pl 300</B>
+sbatch: Submitted batch job 485
+[user@n16 ~]$ <B>sbatch -N1 ./runit.pl 300</B>
+sbatch: Submitted batch job 486
+[user@n16 ~]$ <B>sbatch -N1 ./runit.pl 300</B>
+sbatch: Submitted batch job 487
+[user@n16 ~]$ <B>sbatch -N1 ./runit.pl 300</B>
+sbatch: Submitted batch job 488
+[user@n16 ~]$ <B>sbatch -N1 ./runit.pl 300</B>
+sbatch: Submitted batch job 489
+[user@n16 ~]$ <B>squeue</B>
+JOBID PARTITION     NAME   USER  ST   TIME  NODES NODELIST
+  485    active runit.pl   user   R   0:06      1 n12
+  486    active runit.pl   user   R   0:06      1 n13
+  487    active runit.pl   user   R   0:05      1 n14
+  488    active runit.pl   user   R   0:05      1 n15
+  489    active runit.pl   user   R   0:04      1 n16
+[user@n16 ~]$
+</PRE>
+<P>
+Now submit a short-running 3-node job to the <I>hipri</I> partition:
+</P>
+<PRE>
+[user@n16 ~]$ <B>sbatch -N3 -p hipri ./runit.pl 30</B>
+sbatch: Submitted batch job 490
+[user@n16 ~]$ <B>squeue</B>
+JOBID PARTITION     NAME   USER  ST   TIME  NODES NODELIST
+  488    active runit.pl   user   R   0:29      1 n15
+  489    active runit.pl   user   R   0:28      1 n16
+  485    active runit.pl   user   S   0:27      1 n12
+  486    active runit.pl   user   S   0:27      1 n13
+  487    active runit.pl   user   S   0:26      1 n14
+  490     hipri runit.pl   user   R   0:03      3 n[12-14]
+[user@n16 ~]$
+</PRE>
+<P>
+Job 490 in the <I>hipri</I> partition preempted jobs 485, 486, and 487 from
+the <I>active</I> partition. Jobs 488 and 489 in the <I>active</I> partition
+remained running.
+</P>
+<P>
+This state persisted until job 490 completed, at which point the preempted jobs
+were resumed:
+</P>
+<PRE>
+[user@n16 ~]$ <B>squeue</B>
+JOBID PARTITION     NAME   USER  ST   TIME  NODES NODELIST
+  485    active runit.pl   user   R   0:30      1 n12
+  486    active runit.pl   user   R   0:30      1 n13
+  487    active runit.pl   user   R   0:29      1 n14
+  488    active runit.pl   user   R   0:59      1 n15
+  489    active runit.pl   user   R   0:58      1 n16
+[user@n16 ~]$
+</PRE>
+
+
+<H2><A NAME="future_work">Future Work</A></H2>
+
+<P>
+<B>Preemption with consumable resources</B>: This implementation of preemption
+relies on intelligent job placement by the <I>select</I> plugins. As of SLURM
+1.3.1 the consumable resource <I>select/cons_res</I> plugin still needs
+additional enhancements to the job placement algorithm before its preemption
+support can be considered "competent". The mechanics of preemption work, but the
+placement of preemptive jobs relative to any low-priority jobs may not be
+optimal. The work to improve the placement of preemptive jobs relative to
+existing jobs is currently in progress.
+</P>
+<P>
+<B>Requeue a preempted job</B>: In some situations it may be desirable to
+requeue a low-priority job rather than suspend it. Suspending a job leaves the
+job in memory. Requeuing a job involves terminating the job and resubmitting it.
+This will be investigated at some point in the future. Requeuing a
+preempted job may make the most sense with <I>Shared=NO</I> partitions.
+</P>
+
+<p style="text-align:center;">Last modified 11 April 2008</p>
+
+<!--#include virtual="footer.txt"-->
diff --git a/doc/html/quickstart_admin.shtml b/doc/html/quickstart_admin.shtml
index b92f98c1090..203e4e8ea5f 100644
--- a/doc/html/quickstart_admin.shtml
+++ b/doc/html/quickstart_admin.shtml
@@ -224,7 +224,8 @@ The Maui Scheduler</a> or
 <a href="http://www.clusterresources.com/pages/products/moab-cluster-suite.php">
 Moab Cluster Suite</a> which offer sophisticated scheduling algorithms.
 For more information about these options see
-<a href="gang_scheduling.html">Gang Scheduling</a> and
+<a href="gang_scheduling.html">Gang Scheduling</a>,
+<a href="preempt.html">Preemption</a> and
 <a href="cons_res_share.html">Sharing Consumable Resources</a>.</p> 
 
 <h3>Node selection</h3>
-- 
GitLab