From e33dd1f8085d970250c0fcc45419aed045713be1 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 14 Jul 2009 23:44:46 +0000
Subject: [PATCH] Rename PreemptMode=kill to requeue. Add
 PreemptMode=checkpoint (needs more testing).

---
 NEWS                                          |  5 +-
 doc/html/preempt.shtml                        | 26 +++++--
 doc/man/man5/slurm.conf.5                     |  7 +-
 slurm/slurm.h.in                              |  4 +-
 src/api/config_info.c                         |  6 +-
 src/common/read_config.c                      |  4 +-
 src/plugins/select/cons_res/select_cons_res.c |  3 +-
 src/plugins/select/linear/select_linear.c     |  3 +-
 src/slurmctld/gang.c                          | 77 +++++++++++++------
 9 files changed, 98 insertions(+), 37 deletions(-)

diff --git a/NEWS b/NEWS
index 825008e2f5e..168d3f984ca 100644
--- a/NEWS
+++ b/NEWS
@@ -5,8 +5,9 @@ documents those changes that are of interest to users and admins.
 =============================
  -- Removed sched/gang plugin and moved the logic directly into the slurmctld
     daemon so that job preemption and gang scheduling can be used with the
-    sched/backfill plugin. Added PreemptMode=off|suspend|kill configuration 
-    parameter to enable/disable the job preemption logic (disabled by default).
+    sched/backfill plugin. Added configuration parameter:
+    PreemptMode=off|suspend|kill|requeue|checkpoint 
+    to enable/disable the job preemption logic (disabled by default).
  -- If the --partition option is used with the sinfo or squeue command then
     print information about even hidden partitions.
  -- Replaced misc cpu allocation members in job_info_t with select_job_res_t
diff --git a/doc/html/preempt.shtml b/doc/html/preempt.shtml
index c4c8993105a..a1818099662 100644
--- a/doc/html/preempt.shtml
+++ b/doc/html/preempt.shtml
@@ -26,6 +26,9 @@ or be preempted by other jobs.-->
 SLURM version 2.1 offers the option of requeuing low priority jobs
 rather than suspending them, which may permit the preempted jobs to restart 
 faster by using different resources.
+SLURM version 2.1 also offers the option of checkpointing low priority jobs
+rather than suspending them.
+Checkpointed jobs are not automatically requeued or restarted.
 Requeuing jobs also releases their memory space for use by other jobs.
 In SLURM version 2.1, the job preemption logic was moved directly into the
 main code bases to permit use of both job preemption plus the backfill 
@@ -73,12 +76,15 @@ limits, it will be canceled in order to prevent it from adversely effecting
 other jobs sharing the same resources.
 </LI>
 <LI>
-<B>PreemptMode</B>: Configure to <I>SUSPEND</I> or <I>KILL</I> depending on
-the desired action for low priority jobs. 
+<B>PreemptMode</B>: Configure to <I>CHECKPOINT</I>, <I>SUSPEND</I> or 
+<I>REQUEUE</I> depending on the desired action for low priority jobs. 
+A value of <I>CHECKPOINT</I> will checkpoint (if possible) or kill low 
+priority jobs.
+Checkpointed jobs are not automatically restarted.
+A value of <I>REQUEUE</I> will requeue (if possible) or kill low priority jobs.
+Requeued jobs are permitted to be restarted on different resources.
 A value of <I>SUSPEND</I> will suspend and automatically resume the low 
 priority jobs. 
-A value of <I>KILL</I> will requeue (if possible) or kill low priority jobs.
-Requeued jobs are permitted to be restarted on different resources.
 </LI>
 <LI>
 <B>Priority</B>: Configure the partition's <I>Priority</I> setting relative to
@@ -281,6 +287,16 @@ order to support ideal placements such as this, which can quickly complicate
 the design. Any and all help is welcome here!
 </P>
 
+<P>
+<B>Better memory management</B>: Some additional work is required to 
+better track memory when preempted jobs are checkpointed, requeued or
+killed. 
+In those cases, the memory originally assigned to the job is freed.
+Some additional logic is required to properly track memory being freed
+by these preempted jobs, especially for the select/cons_res plugin,
+which typically runs multiple jobs per node.
+</P>
+
 <P>
 <B>Job preemption based upon Quality of Service (QOS) rather than 
 partition priority</B>: Granting user's a QOS with preemption capabilities
@@ -293,6 +309,6 @@ It would also necessitate major changes to the shadow bitmap data structures
 which are currently used.
 </P>
 
-<p style="text-align:center;">Last modified 9 July 2009</p>
+<p style="text-align:center;">Last modified 14 July 2009</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index f3b2fd49acf..e84dcf04050 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -740,8 +740,11 @@ This is the only option compatible with \fBSchedulerType=sched/wiki\fR
 or \fBSchedulerType=sched/wiki2\fR (used by Maui and Moab respecitvely, 
 which provide their own job preemption functionality).
 .TP
-\fBKILL\fR
-preempts jobs by killing or requeuing them.
+\fBCHECKPOINT\fR
+preempts jobs by checkpointing them (if possible) or killing them.
+.TP
+\fBREQUEUE\fR
+preempts jobs by requeuing them (if possible) or killing them.
 .TP
 \fBSUSPEND\fR
 preempts jobs by suspending them.
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 7017cf47385..a606fdfbf86 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1106,7 +1106,9 @@ typedef struct reservation_name_msg {
 
 #define PREEMPT_MODE_OFF	0x0000	/* disable job preemption */
 #define PREEMPT_MODE_SUSPEND	0x0001	/* suspend jobs to preempt */
-#define PREEMPT_MODE_KILL	0x0002	/* requeue or kill jobs to preempt */
+#define PREEMPT_MODE_REQUEUE	0x0002	/* requeue or kill jobs to preempt */
+#define PREEMPT_MODE_CHECKPOINT	0x0003	/* checkpoint job to preempt, 
+					 * no automatic restart */
 
 typedef struct slurm_ctl_conf {
 	time_t last_update;	/* last update time of the build parameters */
diff --git a/src/api/config_info.c b/src/api/config_info.c
index e846e9212d9..8245f212951 100644
--- a/src/api/config_info.c
+++ b/src/api/config_info.c
@@ -303,8 +303,10 @@ void slurm_print_ctl_conf ( FILE* out,
 		fprintf(out, "PreemptMode             = OFF\n");
 	else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_SUSPEND)
 		fprintf(out, "PreemptMode             = SUSPEND\n");
-	else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_KILL)
-		fprintf(out, "PreemptMode             = KILL\n");
+	else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_CHECKPOINT)
+		fprintf(out, "PreemptMode             = CHECKPOINT\n");
+	else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_REQUEUE)
+		fprintf(out, "PreemptMode             = REQUEUE\n");
 
 	if (strcmp(slurm_ctl_conf_ptr->priority_type, "priority/basic") == 0) {
 		fprintf(out, "PriorityType            = %s\n",
diff --git a/src/common/read_config.c b/src/common/read_config.c
index cb915607d10..d8bc858d00b 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -2126,7 +2126,9 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
 			conf->preempt_mode = PREEMPT_MODE_SUSPEND;
 		else if ((strcasecmp(temp_str, "kill") == 0) ||
 			 (strcasecmp(temp_str, "requeue") == 0))
-			conf->preempt_mode = PREEMPT_MODE_KILL;
+			conf->preempt_mode = PREEMPT_MODE_REQUEUE;
+		else if (strcasecmp(temp_str, "checkpoint") == 0)
+			conf->preempt_mode = PREEMPT_MODE_CHECKPOINT;
 		else
 			fatal("Invalid PreemptMode: %s", temp_str);
 	}
diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c
index 5fe5b5012a4..da8f24d55d3 100644
--- a/src/plugins/select/cons_res/select_cons_res.c
+++ b/src/plugins/select/cons_res/select_cons_res.c
@@ -255,7 +255,8 @@ extern bool cr_preemption_enabled(void)
 		uint16_t mode = slurm_get_preempt_mode();
 		if (mode == PREEMPT_MODE_SUSPEND)
 			job_preemption_enabled = true;
-		else if (mode == PREEMPT_MODE_KILL) {
+		else if ((mode == PREEMPT_MODE_CHECKPOINT) ||
+			 (mode == PREEMPT_MODE_REQUEUE)) {
 			job_preemption_enabled = true;
 			job_preemption_killing = true;
 		}
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index 74338d340ae..fcfdd09fc3c 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -279,7 +279,8 @@ static inline bool _job_preemption_enabled(void)
 		uint16_t mode = slurm_get_preempt_mode();
 		if (mode == PREEMPT_MODE_SUSPEND)
 			job_preemption_enabled = true;
-		else if (mode == PREEMPT_MODE_KILL) {
+		else if ((mode == PREEMPT_MODE_CHECKPOINT) ||
+			 (mode == PREEMPT_MODE_REQUEUE)) {
 			job_preemption_enabled = true;
 			job_preemption_killing = true;
 		}
diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c
index 4dff2440764..90695359f3d 100644
--- a/src/slurmctld/gang.c
+++ b/src/slurmctld/gang.c
@@ -640,7 +640,7 @@ static void _add_job_to_active(struct job_record *job_ptr,
 	p_ptr->jobs_active += 1;
 }
 
-static void _suspend_job(uint32_t job_id)
+static int _suspend_job(uint32_t job_id)
 {
 	int rc;
 	suspend_msg_t msg;
@@ -653,6 +653,7 @@ static void _suspend_job(uint32_t job_id)
 		error("gang: suspending job %u: %s", 
 		      job_id, slurm_strerror(rc));
 	}
+	return rc;
 }
 
 static void _resume_job(uint32_t job_id)
@@ -670,6 +671,39 @@ static void _resume_job(uint32_t job_id)
 	}
 }
 
+/* Checkpoint and vacate preempted job job_id; returns job_checkpoint()'s rc */
+static int _checkpoint_job(uint32_t job_id)
+{
+	int rc;
+	checkpoint_msg_t ckpt_msg;
+
+	/* NOTE: job_checkpoint(VACATE) eventually calls gs_job_fini(),
+	 * so we can't process this request in real-time */
+	memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
+	ckpt_msg.op        = CHECK_VACATE;
+	ckpt_msg.job_id    = job_id;	/* memset left this 0; name the job */
+	rc = job_checkpoint(&ckpt_msg, 0, -1);
+	if (rc == SLURM_SUCCESS)
+		info("gang: preempted job %u has been checkpointed", job_id);
+
+	return rc;
+}
+
+/* Requeue preempted job job_id.
+ * Returns the job_requeue() result code (SLURM_SUCCESS when requeued). */
+static int _requeue_job(uint32_t job_id)
+{
+	/* NOTE: job_requeue eventually calls gs_job_fini(),
+	 * so we can't process this request in real-time */
+	int requeue_rc = job_requeue(0, job_id, -1);
+
+	if (requeue_rc != SLURM_SUCCESS)
+		return requeue_rc;
+
+	info("gang: preempted job %u has been requeued", job_id);
+	return requeue_rc;
+}
+
 void _preempt_job_list_del(void *x)
 {
 	xfree(x);
@@ -684,33 +718,32 @@ static void _preempt_job_queue(uint32_t job_id)
 
 static void _preempt_job_dequeue(void)
 {
-	int rc;
+	int rc = 0;
 	uint32_t job_id, *tmp_id;
+	uint16_t preempt_mode = slurm_get_preempt_mode();
 
 	while ((tmp_id = list_pop(preempt_job_list))) {
 		job_id = *tmp_id;
 		xfree(tmp_id);
 
-		if (slurm_get_preempt_mode() == PREEMPT_MODE_SUSPEND) {
-			_suspend_job(job_id);
-			continue;
-		}
-
-		/* NOTE: job_requeue eventually calls gs_job_fini(),
-		 * so we can't process the request in real-time */
-		rc = job_requeue(0, job_id, -1);
-		if (rc == SLURM_SUCCESS) {
-			info("gang: preempted job %u has been requeued",
-			     job_id);
-			continue;
-		}
-
-		rc = job_signal(job_id, SIGKILL, 0, 0);
-		if (rc == SLURM_SUCCESS)
-			info("gang: preempted job %u has been killed", job_id);
-		else {
-			info("gang: preempted job %u kill failure %s", 
-			     job_id, slurm_strerror(rc));
+		if (preempt_mode == PREEMPT_MODE_SUSPEND)
+			rc = _suspend_job(job_id);
+		else if (preempt_mode == PREEMPT_MODE_REQUEUE)
+			rc = _requeue_job(job_id);
+		else if (preempt_mode == PREEMPT_MODE_CHECKPOINT)
+			rc = _checkpoint_job(job_id);
+		else
+			fatal("Invalid preempt_mode: %u", preempt_mode);
+
+		if (rc != SLURM_SUCCESS) {
+			rc = job_signal(job_id, SIGKILL, 0, 0);
+			if (rc == SLURM_SUCCESS)
+				info("gang: preempted job %u had to be killed", 
+				     job_id);
+			else {
+				info("gang: preempted job %u kill failure %s", 
+				     job_id, slurm_strerror(rc));
+			}
 		}
 	}
 	return;
-- 
GitLab