From cb7d8e30f3dae68d1355aa582bde654497fa076b Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 11 Aug 2009 19:18:39 +0000
Subject: [PATCH] Add "GANG" flag to PreemptMode in order to control whether or
 not gang scheduling is performed.

---
 NEWS                                          |  5 ++-
 doc/html/gang_scheduling.shtml                | 19 ++++----
 doc/man/man5/slurm.conf.5                     | 29 ++++++------
 slurm/slurm.h.in                              |  1 +
 src/api/config_info.c                         | 43 +++++++++++++-----
 src/common/read_config.c                      | 44 +++++++++++++------
 src/plugins/select/cons_res/select_cons_res.c |  1 +
 src/plugins/select/linear/select_linear.c     |  1 +
 src/slurmctld/gang.c                          |  1 +
 src/slurmctld/read_config.c                   | 16 +++----
 src/slurmctld/sched_plugin.c                  | 18 ++++----
 11 files changed, 111 insertions(+), 67 deletions(-)

diff --git a/NEWS b/NEWS
index 82ad136bcb5..2a7adbe1e2a 100644
--- a/NEWS
+++ b/NEWS
@@ -6,8 +6,9 @@ documents those changes that are of interest to users and admins.
  -- Removed sched/gang plugin and moved the logic directly into the slurmctld
     daemon so that job preemption and gang scheduling can be used with the
     sched/backfill plugin. Added configuration parameter:
-    PreemptMode=off|suspend|cancel|requeue|checkpoint 
-    to enable/disable the job preemption logic (disabled by default).
+    PreemptMode=gang|off|suspend|cancel|requeue|checkpoint 
+    to enable/disable gang scheduling and job preemption logic (both are 
+    disabled by default).
     (NOTE: There are some problems with memory management which could prevent a
     job from starting when memory would be freed by a job being requeued or 
     otherwise removed, these are being worked on)
diff --git a/doc/html/gang_scheduling.shtml b/doc/html/gang_scheduling.shtml
index 382a977d548..4722a80e2a4 100644
--- a/doc/html/gang_scheduling.shtml
+++ b/doc/html/gang_scheduling.shtml
@@ -79,8 +79,9 @@ memory limits, it will be canceled in order to prevent it from
 adversely effecting other jobs sharing the same resources.
 </LI>
 <LI>
-<B>PreemptMode</B>: Configure to <I>SUSPEND</I> or <I>KILL</I> (any value
-other than <I>OFF</I> enables gang scheduling).
+<B>PreemptMode</B>: Set the <I>GANG</I> option to enable gang scheduling.
+Additional options may be specified to enable job preemption in addition
+to gang scheduling.
 </LI>
 <LI>
 <B>SchedulerTimeSlice</B>: The default timeslice interval is 30 seconds. 
@@ -163,16 +164,16 @@ PARTITION AVAIL  TIMELIMIT NODES  STATE NODELIST
 active*      up   infinite     5   idle n[12-16]
 </PRE>
 <P>
-Here are the Scheduler settings (the last two settings are the relevant ones):
+Here are the Scheduler settings (excerpt of output):
 </P>
 <PRE>
-[user@n16 load]$ <B>scontrol show config | grep Sched</B>
-FastSchedule            = 1
-PreemptMode             = SUSPEND
-SchedulerPort           = 7321
-SchedulerRootFilter     = 1
+[user@n16 load]$ <B>scontrol show config</B>
+...
+PreemptMode             = GANG
+...
 SchedulerTimeSlice      = 30
 SchedulerType           = sched/builtin
+...
 </PRE>
 <P>
 The <I>myload</I> script launches a simple load-generating app that runs
@@ -520,6 +521,6 @@ For now this idea could be experimented with by disabling memory support in
 the selector and submitting appropriately sized jobs.
 </P>
 
-<p style="text-align:center;">Last modified 9 July 2009</p>
+<p style="text-align:center;">Last modified 11 August 2009</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 9c4284101ed..17c0883a5f6 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -1,4 +1,4 @@
-.TH "slurm.conf" "5" "July 2009" "slurm.conf 2.1" "Slurm configuration file"
+.TH "slurm.conf" "5" "August 2009" "slurm.conf 2.1" "Slurm configuration file"
 
 .SH "NAME"
 slurm.conf \- Slurm configuration file 
@@ -733,21 +733,19 @@ on SPANK plugins, see the \fBspank\fR(8) manual.
 
 .TP
 \fBPreemptMode\fR
-Controls the mechanism used to preempt lower priority jobs in order to 
-execute higher priority jobs.
-For gang scheduling of jobs having the same priority, job suspend will
-always be used and \fBPreemptMode\fR must be set to \fICANCEL\fR, 
-\fIREQUEUE\fR or \fISUSPEND\fR (anything other than \fIOFF\fR).
-Two different mechanisms can be used to determine when preeption can 
-take place. 
+Enables gang scheduling and/or controls the mechanism used to preempt lower 
+priority jobs in order to execute higher priority jobs.
+For gang scheduling (time slicing) of jobs in the same partition, 
+job suspend will always be used.
 SLURM partitions can have different \fBPriority\fR values and contain 
 the same nodes, in which case jobs from the higher priority 
-partition will preempt jobs from the lower priority partition.
-Alternately when SLURM is used with an accounting database, the jobs can be
-submitted to the same partition but have different Quality Of Service (QOS) 
-values. 
-Jobs from a QOS with a descriptor containing \fIPREEMPTOR\fR are capable
-of preempting jobs from a QOS with a descriptor containing \fIPREEMPTEE\fR.
+partition can preempt jobs from the lower priority partition.
+The \fBGANG\fR option can be combined with one preemption method by
+listing both values separated by a comma (e.g. "PreemptMode=GANG,SUSPEND").
+NOTE: \fBGANG\fR must be specified for job preemption to occur.
+Changes are underway to permit job preemption without gang scheduling
+and job preemption based upon QOS (Quality Of Service) rather than 
+partition priority.
 .RS
 .TP 12
 \fBOFF\fR
@@ -762,6 +760,9 @@ always cancel the job.
 \fBCHECKPOINT\fR
 preempts jobs by checkpointing them (if possible) or cancelling them.
 .TP
+\fBGANG\fR
+enables gang scheduling (time slicing) of jobs in the same partition.
+.TP
 \fBREQUEUE\fR
 preempts jobs by requeuing them (if possible) or cancelling them.
 .TP
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index f09bafc9356..d5f89258846 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1209,6 +1209,7 @@ typedef struct reservation_name_msg {
 #define PREEMPT_MODE_CHECKPOINT	0x0003	/* checkpoint job to preempt, 
 					 * no automatic restart */
 #define PREEMPT_MODE_CANCEL	0x0004	/* always cancel the job */
+#define PREEMPT_MODE_GANG	0x8000	/* enable gang scheduling */
 
 typedef struct slurm_ctl_conf {
 	time_t last_update;	/* last update time of the build parameters */
diff --git a/src/api/config_info.c b/src/api/config_info.c
index 1da7e62df9f..425a4f8f2a6 100644
--- a/src/api/config_info.c
+++ b/src/api/config_info.c
@@ -116,6 +116,37 @@ _reset_period_str(uint16_t reset_period)
 	}
 }
 
+/*
+ * _preempt_str - convert a PreemptMode value into its printable name
+ * IN preempt_mode - PREEMPT_MODE_* value, optionally with PREEMPT_MODE_GANG set
+ * RET string literal or pointer to a static buffer rewritten by each call
+ */
+static char *
+_preempt_str(uint16_t preempt_mode)
+{
+	const char *gang_str = "", *mode_str = "UNKNOWN";
+	static char preempt_str[64];
+
+	if (preempt_mode == PREEMPT_MODE_OFF)
+		return "OFF";
+	if (preempt_mode == PREEMPT_MODE_GANG)
+		return "GANG";
+	if (preempt_mode & PREEMPT_MODE_GANG) {
+		gang_str = "GANG,";
+		preempt_mode &= (~PREEMPT_MODE_GANG);
+	}
+	if      (preempt_mode == PREEMPT_MODE_CANCEL)
+		mode_str = "CANCEL";
+	else if (preempt_mode == PREEMPT_MODE_CHECKPOINT)
+		mode_str = "CHECKPOINT";
+	else if (preempt_mode == PREEMPT_MODE_REQUEUE)
+		mode_str = "REQUEUE";
+	else if (preempt_mode == PREEMPT_MODE_SUSPEND)
+		mode_str = "SUSPEND";
+	snprintf(preempt_str, sizeof(preempt_str), "%s%s", gang_str, mode_str);
+	return preempt_str;
+}
+
 /*
  * slurm_print_ctl_conf - output the contents of slurm control configuration 
  *	message as loaded using slurm_load_ctl_conf
@@ -299,16 +330,8 @@ void slurm_print_ctl_conf ( FILE* out,
 	fprintf(out, "PlugStackConfig         = %s\n",
 		slurm_ctl_conf_ptr->plugstack);
 
-	if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_OFF)
-		fprintf(out, "PreemptMode             = OFF\n");
-	else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_CANCEL)
-		fprintf(out, "PreemptMode             = CANCEL\n");
-	else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_CHECKPOINT)
-		fprintf(out, "PreemptMode             = CHECKPOINT\n");
-	else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_REQUEUE)
-		fprintf(out, "PreemptMode             = REQUEUE\n");
-	else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_SUSPEND)
-		fprintf(out, "PreemptMode             = SUSPEND\n");
+	fprintf(out, "PreemptMode             = %s\n",
+		_preempt_str(slurm_ctl_conf_ptr->preempt_mode));
 
 	if (strcmp(slurm_ctl_conf_ptr->priority_type, "priority/basic") == 0) {
 		fprintf(out, "PriorityType            = %s\n",
diff --git a/src/common/read_config.c b/src/common/read_config.c
index f56180da69d..0b2577fa716 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -2119,20 +2119,36 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
 		conf->plugstack = xstrdup(default_plugstack);
 
 	if (s_p_get_string(&temp_str, "PreemptMode", hashtbl)) {
-		if (strcasecmp(temp_str, "off") == 0)
-			conf->preempt_mode = PREEMPT_MODE_OFF;
-		else if (strcasecmp(temp_str, "cancel") == 0)
-			conf->preempt_mode = PREEMPT_MODE_CANCEL;
-		else if (strcasecmp(temp_str, "checkpoint") == 0)
-			conf->preempt_mode = PREEMPT_MODE_CHECKPOINT;
-		else if (strcasecmp(temp_str, "requeue") == 0)
-			conf->preempt_mode = PREEMPT_MODE_REQUEUE;
-		else if ((strcasecmp(temp_str, "on") == 0) ||
-			 (strcasecmp(temp_str, "suspend") == 0))
-			conf->preempt_mode = PREEMPT_MODE_SUSPEND;
-
-		else
-			fatal("Invalid PreemptMode: %s", temp_str);
+		int preempt_modes = 0;
+		char *last = NULL, *tok;
+		conf->preempt_mode = 0;
+		tok = strtok_r(temp_str, ",", &last);
+		while (tok) {
+			if (strcasecmp(tok, "gang") == 0) {
+				conf->preempt_mode |= PREEMPT_MODE_GANG;
+			} else if (strcasecmp(tok, "off") == 0) {
+				conf->preempt_mode += PREEMPT_MODE_OFF;
+				preempt_modes++;
+			} else if (strcasecmp(tok, "cancel") == 0) {
+				conf->preempt_mode += PREEMPT_MODE_CANCEL;
+				preempt_modes++;
+			} else if (strcasecmp(tok, "checkpoint") == 0) {
+				conf->preempt_mode += PREEMPT_MODE_CHECKPOINT;
+				preempt_modes++;
+			} else if (strcasecmp(tok, "requeue") == 0) {
+				conf->preempt_mode += PREEMPT_MODE_REQUEUE;
+				preempt_modes++;
+			} else if ((strcasecmp(tok, "on") == 0) ||
+				 (strcasecmp(tok, "suspend") == 0)) {
+				conf->preempt_mode += PREEMPT_MODE_SUSPEND;
+				preempt_modes++;
+			} else
+				fatal("Invalid PreemptMode: %s", tok);
+			tok = strtok_r(NULL, ",", &last);
+		}
+		xfree(temp_str);
+		if (preempt_modes > 1)
+			fatal("More than one PreemptMode specified");
 	}
 
 	if (s_p_get_string(&temp_str, "PriorityDecayHalfLife", hashtbl)) {
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index f955843e47c..798294e5fb5 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -253,6 +253,7 @@ extern bool cr_preemption_enabled(void)
 {
 	if (!job_preemption_tested) {
 		uint16_t mode = slurm_get_preempt_mode();
+		mode &= ~PREEMPT_MODE_GANG;
 		if (mode == PREEMPT_MODE_SUSPEND)
 			job_preemption_enabled = true;
 		else if ((mode == PREEMPT_MODE_CANCEL)     ||
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index 83ab35d3714..bef6c6006a5 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -277,6 +277,7 @@ static inline bool _job_preemption_enabled(void)
 {
 	if (!job_preemption_tested) {
 		uint16_t mode = slurm_get_preempt_mode();
+		mode &= (~PREEMPT_MODE_GANG);
 		if (mode == PREEMPT_MODE_SUSPEND)
 			job_preemption_enabled = true;
 		else if ((mode == PREEMPT_MODE_CANCEL)     ||
diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c
index d85f47a2cbe..58c3e411cb4 100644
--- a/src/slurmctld/gang.c
+++ b/src/slurmctld/gang.c
@@ -733,6 +733,7 @@ static void _preempt_job_dequeue(void)
 	uint32_t job_id, *tmp_id;
 	uint16_t preempt_mode = slurm_get_preempt_mode();
 
+	preempt_mode &= (~PREEMPT_MODE_GANG);
 	while ((tmp_id = list_pop(preempt_job_list))) {
 		job_id = *tmp_id;
 		xfree(tmp_id);
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 29d53abf2e3..2cb3bc7f888 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -1042,28 +1042,26 @@ static int  _preserve_select_type_param(slurm_ctl_conf_t *ctl_conf_ptr,
 }
 
 /* Start or stop the gang scheduler module as needed based upon changes in 
- *	job preemption support */
+ *	the PREEMPT_MODE_GANG bit of the preempt mode */
 static int _update_preempt(uint16_t old_preempt_mode)
 {
 	uint16_t new_preempt_mode = slurm_get_preempt_mode();
 
-	if ((old_preempt_mode == PREEMPT_MODE_OFF) == 
-	    (new_preempt_mode == PREEMPT_MODE_OFF))
+	if ((old_preempt_mode & PREEMPT_MODE_GANG) ==
+	    (new_preempt_mode & PREEMPT_MODE_GANG))
 		return SLURM_SUCCESS;
 
-	if (old_preempt_mode == PREEMPT_MODE_OFF) {
-		info("Enabling job preemption and gang scheduling");
+	if (new_preempt_mode & PREEMPT_MODE_GANG) {
+		info("Enabling gang scheduling");
 		return gs_init();
 	}
 
-	if (new_preempt_mode == PREEMPT_MODE_OFF) {
-		info("Disabling job preemption and gang scheduling");
+	if (old_preempt_mode & PREEMPT_MODE_GANG) {	/* incl. GANG,<mode> */
+		info("Disabling gang scheduling");
 		gs_wake_jobs();
 		return gs_fini();
 	}
 
-	error("Invalid value for EnablePreemption (old:%u new:%u)",
-	      old_preempt_mode, new_preempt_mode);
 	return EINVAL;
 }
 
diff --git a/src/slurmctld/sched_plugin.c b/src/slurmctld/sched_plugin.c
index 56df2f74a72..e924a8adc51 100644
--- a/src/slurmctld/sched_plugin.c
+++ b/src/slurmctld/sched_plugin.c
@@ -17,7 +17,7 @@
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
+ *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and 
  *  distribute linked combinations including the two. You must obey the GNU 
  *  General Public License in all respects for all of the code used other than 
@@ -240,7 +240,7 @@ slurm_sched_init( void )
 		goto done;
 	}
 
-	if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && 
+	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && 
 	     (gs_init() != SLURM_SUCCESS))
 		error( "cannot start gang scheduler ");
 
@@ -264,7 +264,7 @@ slurm_sched_fini( void )
 	rc = slurm_sched_context_destroy(g_sched_context);
 	g_sched_context = NULL;
 
-	if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) &&
+	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && 
 	     (gs_fini() != SLURM_SUCCESS))
 		error( "cannot stop gang scheduler" );
 
@@ -281,7 +281,7 @@ slurm_sched_reconfig( void )
 	if ( slurm_sched_init() < 0 )
 		return SLURM_ERROR;
 
-	if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) &&
+	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && 
 	     (gs_reconfig() != SLURM_SUCCESS))
 		error( "cannot reconfigure gang scheduler" );
 
@@ -299,7 +299,7 @@ slurm_sched_schedule( void )
 
 #if 0
 	/* synchronize job listings? Here? */
-	if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) &&
+	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && 
 	     (gs_job_scan() != SLURM_SUCCESS))
 		error( "gang scheduler could not rescan jobs" );
 #endif
@@ -316,7 +316,7 @@ slurm_sched_newalloc( struct job_record *job_ptr )
 	if ( slurm_sched_init() < 0 )
 		return SLURM_ERROR;
 
-	if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && 
+	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && 
 	     (gs_job_start( job_ptr ) != SLURM_SUCCESS)) {
 		error( "gang scheduler problem starting job %u", 
 		       job_ptr->job_id);
@@ -334,7 +334,7 @@ slurm_sched_freealloc( struct job_record *job_ptr )
 	if ( slurm_sched_init() < 0 )
 		return SLURM_ERROR;
 
-	if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && 
+	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && 
 	     (gs_job_fini( job_ptr ) != SLURM_SUCCESS)) {
 		error( "gang scheduler problem finishing job %u", 
 		       job_ptr->job_id);
@@ -367,7 +367,7 @@ slurm_sched_job_is_pending( void )
 	if ( slurm_sched_init() < 0 )
 		return;
 
-	if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) &&
+	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && 
 	     (gs_reconfig() != SLURM_SUCCESS))
 		error( "cannot reconfigure gang scheduler" );
 
@@ -385,7 +385,7 @@ slurm_sched_partition_change( void )
 
 #if 0
 	/* synchronize job listings? Here? */
-	if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) &&
+	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && 
 	     (gs_job_scan() != SLURM_SUCCESS))
 		error( "gang scheduler could not rescan jobs" );
 #endif
-- 
GitLab