From ee4b3e9971df8f3d02e4c3c140c753c231f2eb42 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 19 Aug 2009 18:29:46 +0000
Subject: [PATCH] Change default PreemptType to preempt/none. Validity check
 PreemptType and PreemptMode values for compatibility. Update documentation
 on PreemptType and PreemptMode.

---
 doc/html/preempt.shtml    | 96 +++++++++++++++++++++++----------------
 doc/man/man5/slurm.conf.5 | 21 ++++-----
 src/common/read_config.c  | 16 +++++++
 src/common/read_config.h  |  2 +-
 4 files changed, 83 insertions(+), 52 deletions(-)

diff --git a/doc/html/preempt.shtml b/doc/html/preempt.shtml
index 5b2d4843ce2..adbe05226b5 100644
--- a/doc/html/preempt.shtml
+++ b/doc/html/preempt.shtml
@@ -18,21 +18,21 @@ resources if so configured in newer versions of SLURM.
 In SLURM version 2.0 and earlier, high priority work is identified by the 
 priority of the job's partition and low priority jobs are always suspended.
 The job preemption logic is within the <I>sched/gang</I> plugin.
+In SLURM version 2.1 and higher, the job's partition priority or its 
+Quality Of Service (QOS) can be used to identify which jobs can preempt 
+or be preempted by other jobs.
 </P>
 <P>
-<!--In SLURM version 2.1 and higher, the job's partition priority or its 
-Quality Of Service (QOS) can be used to identify the which jobs can preempt 
-or be preempted by other jobs.-->
-SLURM version 2.1 offers the option of requeuing low priority jobs
-rather than suspending them, which may permit the preempted jobs to restart 
-faster by using different resources.
-SLURM version 2.1 also offers the option of checkpointing low priority jobs
-rather than suspending them
+SLURM version 2.1 offers several options for the job preemption mechanism
+including checkpoint, requeue, or cancel, as alternatives to
+suspending low priority jobs.
 Checkpointed jobs are not automatically requeued or restarted.
-Requeuing jobs also releases their memory space for use by other jobs.
-In SLURM version 2.1, the job preemption logic was moved directly into the
-main code bases to permit use of both job preemption plus the backfill 
-scheduler plugin, <i>sched/backfill</I>.
+Requeued jobs may restart faster by using different resources.
+All of these new job preemption mechanisms release a job's memory space for 
+use by other jobs.
+In SLURM version 2.1, some job preemption logic was moved into the
+<I>select</I> plugin and main code base to permit use of both job preemption 
+plus the backfill scheduler plugin, <i>sched/backfill</I>.
 </P>
 
 <H2>Configuration</H2>
@@ -46,13 +46,15 @@ There are several important configuration parameters relating to preemption:
 <I>select/cons_res</I> plugin.
 </LI>
 <LI>
-<B>SelectTypeParameter</B>: Since resources may be getting overallocated 
+<B>SelectTypeParameter</B>: Since resources may be getting over-allocated 
 with jobs (suspended jobs remain in memory), the resource selection
 plugin should be configured to track the amount of memory used by each job to
 ensure that memory page swapping does not occur. When <I>select/linear</I> is
 chosen, we recommend setting <I>SelectTypeParameter=CR_Memory</I>. When
 <I>select/cons_res</I> is chosen, we recommend including Memory as a resource
-(ex. <I>SelectTypeParameter=CR_Core_Memory</I>).
+(ex. <I>SelectTypeParameter=CR_Core_Memory</I>). 
+<BR><B>NOTE:</B> Unless <I>PreemptMode=SUSPEND,GANG</I> these memory management
+parameters are not critical.
 </LI>
 <LI>
 <B>DefMemPerCPU</B>: Since job requests may not explicitly specify 
@@ -64,6 +66,8 @@ It may also be desirable to configure
 <I>MaxMemPerNode</I> (maximum memory per allocated node) in <I>slurm.conf</I>.
 Users can use the <I>--mem</I> or <I>--mem-per-cpu</I> option
 at job submission time to specify their memory requirements.
+<BR><B>NOTE:</B> Unless <I>PreemptMode=SUSPEND,GANG</I> these memory management
+parameters are not critical.
 </LI>
 <LI>
 <B>JobAcctGatherType and JobAcctGatherFrequency</B>: The "maximum data segment
@@ -74,6 +78,8 @@ accounting with the <I>JobAcctGatherType</I> and <I>JobAcctGatherFrequency</I>
 parameters. When accounting is enabled and a job exceeds its configured memory
 limits, it will be canceled in order to prevent it from adversely effecting
 other jobs sharing the same resources.
+<BR><B>NOTE:</B> Unless <I>PreemptMode=SUSPEND,GANG</I> these memory management
+parameters are not critical.
 </LI>
 <LI>
 <B>PreemptMode</B>: Configure to <I>CANCEL</I>, <I>CHECKPOINT</I>, 
@@ -87,17 +93,38 @@ Checkpointed jobs are not automatically restarted.
 <LI>A value of <I>REQUEUE</I> will requeue (if possible) or kill low priority 
 jobs. Requeued jobs are permitted to be restarted on different resources.</LI>
 <LI>A value of <I>SUSPEND</I> will suspend and automatically resume the low 
-priority jobs. </LI>
+priority jobs. The <I>SUSPEND</I> option must be used with the <I>GANG</I>
+option (e.g. "PreemptMode=SUSPEND,GANG").</LI>
+</UL>
+</LI>
+<LI>
+<B>PreemptType</B>: Configure to the desired mechanism used to identify
+which jobs can preempt other jobs.
+<UL>
+<LI><I>preempt/none</I> indicates that jobs will not preempt each other
+(default).</LI>
+<LI><I>preempt/partition_prio</I> indicates that jobs from one partition 
+can preempt jobs from lower priority partitions.</LI>
+<LI><I>preempt/qos</I> indicates that jobs from one Quality Of Service (QOS) 
+can preempt jobs from a lower QOS. These jobs can be in the same partition 
+or different partitions. PreemptMode must be set to CANCEL, CHECKPOINT 
+or REQUEUE. This option requires the use of a database identifying
+available QOS and their preemption rules. </LI>
 </UL>
 </LI>
 <LI>
 <B>Priority</B>: Configure the partition's <I>Priority</I> setting relative to
-other partitions to control the preemptive behavior. If two jobs from two
+other partitions to control the preemptive behavior when 
+<I>PreemptType=preempt/partition_prio</I>. 
+This option is not relevant if <I>PreemptType=preempt/qos</I>. 
+If two jobs from two
 different partitions are allocated to the same resources, the job in the
 partition with the greater <I>Priority</I> value will preempt the job in the
 partition with the lesser <I>Priority</I> value. If the <I>Priority</I> values
 of the two partitions are equal then no preemption will occur. The default
 <I>Priority</I> value is 1.
+<BR><B>NOTE:</B> Unless <I>PreemptType=preempt/partition_prio</I> the
+partition <I>Priority</I> is not critical.
 </LI>
 <LI>
 <B>Shared</B>: Configure the partition's <I>Shared</I> setting to 
@@ -120,6 +147,16 @@ SLURM requires a full restart of the daemons. If you just change the partition
 
 <H2>Preemption Design and Operation</H2>
 
+<P>
+The select plugin will identify resources where a pending job can begin 
+execution.
+When <I>PreemptMode</I> is configured to CANCEL, CHECKPOINT, SUSPEND or 
+REQUEUE, the select plugin will also preempt running jobs as needed to 
+initiate the pending job. 
+When <I>PreemptMode=SUSPEND,GANG</I> the select plugin will initiate the 
+pending job and rely upon the gang scheduling logic to perform job suspend 
+and resume as described below.
+</P>
 <P>
 When enabled, the gang scheduling logic (which is also supports job 
 preemption) keeps track of the resources allocated to all jobs. 
@@ -166,7 +203,8 @@ A good way to observe the operation of the gang scheduler is by running
 
 <P>
 The following example is configured with <I>select/linear</I> and
-<I>PreemptMode=SUSPEND</I>. This example takes place on a cluster of 5 nodes:
+<I>PreemptMode=SUSPEND,GANG</I>. 
+This example takes place on a cluster of 5 nodes:
 </P>
 <PRE>
 [user@n16 ~]$ <B>sinfo</B>
@@ -291,28 +329,6 @@ order to support ideal placements such as this, which can quickly complicate
 the design. Any and all help is welcome here!
 </P>
 
-<P>
-<B>Better memory management</B>: Some additional work is required to 
-better track memory when preempted jobs are checkpointed, requeued or
-killed. 
-In those cases, the memory originally assigned to the job is freed.
-Some addition logic is required to properly track memory being freed
-by these preempted jobs, especially for the select/cons_res plugin,
-which typically runs multiple jobs per node.
-</P>
-
-<P>
-<B>Job preemption based upon Quality of Service (QOS) rather than 
-partition priority</B>: Granting user's a QOS with preemption capabilities
-could be used as an alternative to prartition priorities. 
-Granting some user access to a QOS with a descriptor of <I>PREEMPTOR</I> 
-could be used to grant him the ability to preempt any job from any partition
-without creating various partitions with different priorities.
-However this mechanism lacks the ability to support many priority levels.
-It would also necessitate major changes to the shadow bitmap data structures
-which are currently used.
-</P>
-
-<p style="text-align:center;">Last modified 20 July 2009</p>
+<p style="text-align:center;">Last modified 19 August 2009</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 0fc0faae8b5..d3ff9a75788 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -738,19 +738,14 @@ on SPANK plugins, see the \fBspank\fR(8) manual.
 
 .TP
 \fBPreemptMode\fR
-Enables gang scheduling and/or controls the mechanism used to preempt lower 
-priority jobs in order to execute higher priority jobs.
-For gang scheduling (time slicing) of jobs in the same partition, 
-job suspend will always be used.
-SLURM partitions can have different \fBPriority\fR values and contain 
-the same nodes, in which case jobs from the higher priority 
-partition can preempt jobs from the lower priority partition.
+Enables gang scheduling and/or controls the mechanism used to preempt jobs.
+The \fBPreemptType\fR parameter controls the mechanism used to determine
+which jobs are capable of preempting other jobs while \fBPreemptMode\fR
+controls the mechanism used to preempt the lower priority jobs.
+Jobs which are being gang scheduled (time sliced) rather than being preempted
+will always make use of a suspend/resume mechanism.
 The \fBGANG\fR option can be specified in addition to a preemption 
 method specification with the two options comma separated.
-NOTE: \fBGANG\fR must be specified for job preemption to occur.
-Changes are underway to permit job preemption without gang scheduling
-and job preemption based upon QOS (Quality Of Service) rather than 
-partition priority.
 .RS
 .TP 12
 \fBOFF\fR
@@ -775,6 +770,8 @@ preempts jobs by requeuing them (if possible) or cancelling them.
 preempts jobs by suspending them.
 A suspended job will resume execution once the high priority job 
 preempting it completes.
+The \fBSUSPEND\fR option may only be used with the \fBGANG\fR option
+(the gang scheduler module performs the job resume operation). 
 .RE
 
 .TP
@@ -795,6 +792,8 @@ priority partitions.
 \fBpreempt/qos\fR
 Job preemption rules are specified by Quality Of Service (QOS) specifications
 in the SLURM database a database.
+This is not compatible with \fBPreemptMode=OFF\fR or \fBPreemptMode=SUSPEND\fR
+(i.e. preempted jobs must be removed from the resources).
 .RE
 
 .TP
diff --git a/src/common/read_config.c b/src/common/read_config.c
index a7fc94fc2af..335296e5e77 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -2165,9 +2165,25 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
 		xfree(temp_str);
 		if (preempt_modes > 1)
 			fatal("More than one PreemptMode specified");
+		if (conf->preempt_mode == PREEMPT_MODE_SUSPEND)
+			fatal("PreemptMode=SUSPEND requires GANG too");
 	}
 	if (!s_p_get_string(&conf->preempt_type, "PreemptType", hashtbl))
 		conf->preempt_type = xstrdup(DEFAULT_PREEMPT_TYPE);
+	if (strcmp(conf->preempt_type, "preempt/qos") == 0) {
+		int preempt_mode = conf->preempt_mode & (~PREEMPT_MODE_GANG);
+		if ((preempt_mode == PREEMPT_MODE_OFF) ||
+		    (preempt_mode == PREEMPT_MODE_SUSPEND)) {
+			fatal("PreemptType and PreemptMode values "
+			      "incompatible");
+		}
+	} else if (strcmp(conf->preempt_type, "preempt/none") == 0) {
+		int preempt_mode = conf->preempt_mode & (~PREEMPT_MODE_GANG);
+		if (preempt_mode != PREEMPT_MODE_OFF) {
+			fatal("PreemptType and PreemptMode values "
+			      "incompatible");
+		}
+	}
 
 	if (s_p_get_string(&temp_str, "PriorityDecayHalfLife", hashtbl)) {
 		int max_time = time_str2mins(temp_str);
diff --git a/src/common/read_config.h b/src/common/read_config.h
index 035e33f8d63..695785cbd85 100644
--- a/src/common/read_config.h
+++ b/src/common/read_config.h
@@ -96,7 +96,7 @@ extern char *default_plugstack;
 #  define DEFAULT_CHECKPOINT_TYPE   "checkpoint/none"
 #  define DEFAULT_PROCTRACK_TYPE    "proctrack/pgid"
 #endif
-#define DEFAULT_PREEMPT_TYPE        "preempt/partition_prio"
+#define DEFAULT_PREEMPT_TYPE        "preempt/none"
 #define DEFAULT_PRIORITY_DECAY      604800 /* 7 days */
 #define DEFAULT_PRIORITY_TYPE       "priority/basic"
 #define DEFAULT_PROPAGATE_PRIO_PROCESS 0
-- 
GitLab