From 5f9eb24a91c23de77635f7770a579cc36d1388a4 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 28 Sep 2010 23:36:24 +0000
Subject: [PATCH]  -- Added new checkpoint mode "scontrol checkpoint requeue
 <job_id>" to     checkpoint and requeue a batch job. Note, there appear to be
 some     problems with checkpoint/blcr which are unrelated to these changes.

---
 doc/man/man1/scontrol.1                       | 41 ++++++++-----
 slurm/slurm.h.in                              | 11 ++++
 src/api/checkpoint.c                          | 15 +++++
 src/common/checkpoint.c                       |  4 +-
 src/common/checkpoint.h                       |  3 +-
 src/plugins/checkpoint/aix/checkpoint_aix.c   |  1 +
 src/plugins/checkpoint/blcr/checkpoint_blcr.c | 57 ++++++++++++++++++-
 src/plugins/checkpoint/ompi/checkpoint_ompi.c |  1 +
 src/plugins/checkpoint/xlch/checkpoint_xlch.c |  1 +
 src/scontrol/scontrol.c                       |  2 +-
 src/scontrol/update_job.c                     |  6 ++
 src/slurmctld/gang.c                          |  9 ++-
 src/slurmctld/job_mgr.c                       |  3 +-
 src/slurmctld/proc_req.c                      |  3 +
 14 files changed, 135 insertions(+), 22 deletions(-)

diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1
index 1566070965c..e06bd6ba209 100644
--- a/doc/man/man1/scontrol.1
+++ b/doc/man/man1/scontrol.1
@@ -1,4 +1,4 @@
-.TH SCONTROL "1" "July 2010" "scontrol 2.2" "Slurm components"
+.TH SCONTROL "1" "September 2010" "scontrol 2.2" "Slurm components"
 
 .SH "NAME"
 scontrol \- Used view and modify Slurm configuration and state.
@@ -75,33 +75,48 @@ which applies to all of its existing steps)
 or a specific job step (e.g. "<job_id>.<step_id>").
 Acceptable values for \fICKPT_OP\fP include:
 .RS
+.TP 12
+\fIable\fP
+Test if presently not disabled, report start time if checkpoint in progress
 .TP
-\fIdisable\fP (disable future checkpoints)
-.TP
-\fIenable\fP (enable future checkpoints)
+\fIcreate\fP
+Create a checkpoint and continue the job or job step
 .TP
-\fIable\fP (test if presently not disabled, report start time if checkpoint in progress)
+\fIdisable\fP
+Disable future checkpoints
 .TP
-\fIcreate\fP (create a checkpoint and continue the job step)
+\fIenable\fP
+Enable future checkpoints
 .TP
-\fIvacate\fP (create a checkpoint and terminate the job step)
+\fIerror\fP
+Report the result for the last checkpoint request, error code and message
 .TP
-\fIerror\fP (report the result for the last checkpoint request, error code and message)
+\fIrestart\fP
+Restart execution of the previously checkpointed job or job step
 .TP
-\fIrestart\fP (restart execution of the previously checkpointed job steps)
+\fIrequeue\fP
+Create a checkpoint and requeue the batch job, combines vacate
+and restart operations
 .TP
+\fIvacate\fP
+Create a checkpoint and terminate the job or job step
+.RE
 Acceptable values for \fICKPT_OP\fP include:
-.TP
-\fIMaxWait=<seconds>\fP maximum time for checkpoint to be written.
+.RS
+.TP 20
+\fIMaxWait=<seconds>\fP
+Maximum time for checkpoint to be written.
 Default value is 10 seconds.
 Valid with \fIcreate\fP and \fIvacate\fP options only.
 .TP
-\fIImageDir=<directory_name>\fP Location of checkpoint file.
+\fIImageDir=<directory_name>\fP
+Location of checkpoint file.
 Valid with \fIcreate\fP, \fIvacate\fP and \fIrestart\fP options only.
 This value takes precedent over any \-\-checkpoint\-dir value specified
 at job submission time.
 .TP
-\fIStickToNodes\fP If set, resume job on the same nodes are previously used.
+\fIStickToNodes\fP
+If set, resume job on the same nodes as previously used.
 Valid with the \fIrestart\fP option only.
 .RE
 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 41fae1bcd44..3c9ae9c3579 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -3083,6 +3083,17 @@ extern int slurm_checkpoint_create PARAMS((uint32_t job_id,
 					   uint16_t max_wait,
 					   char *image_dir));
 
+/*
+ * slurm_checkpoint_requeue - initiate a checkpoint request for some job.
+ *	the job will be requeued after the checkpoint operation completes
+ * IN job_id  - job on which to perform operation
+ * IN max_wait - maximum wait for operation to complete, in seconds
+ * IN image_dir - directory used to get/put checkpoint images
+ * RET 0 or a slurm error code
+ */
+extern int slurm_checkpoint_requeue PARAMS((uint32_t job_id, uint16_t max_wait,
+					    char *image_dir));
+
 /*
  * slurm_checkpoint_vacate - initiate a checkpoint requests for some job step.
  *	the job will terminate after the checkpoint operation completes
diff --git a/src/api/checkpoint.c b/src/api/checkpoint.c
index b571e836ed5..edb0550c0b8 100644
--- a/src/api/checkpoint.c
+++ b/src/api/checkpoint.c
@@ -179,6 +179,21 @@ extern int slurm_checkpoint_create (uint32_t job_id, uint32_t step_id,
 			       image_dir);
 }
 
+/*
+ * slurm_checkpoint_requeue - initiate a checkpoint request for some job.
+ *	the job will be requeued after the checkpoint operation completes
+ * IN job_id  - job on which to perform operation
+ * IN max_wait - maximum wait for operation to complete, in seconds
+ * IN image_dir - directory used to get/put checkpoint images
+ * RET 0 or a slurm error code
+ */
+extern int slurm_checkpoint_requeue (uint32_t job_id, uint16_t max_wait,
+				     char *image_dir)
+{
+	return _checkpoint_op (CHECK_REQUEUE, max_wait, job_id,
+			       (uint32_t) SLURM_BATCH_SCRIPT, image_dir);
+}
+
 /*
  * slurm_checkpoint_vacate - initiate a checkpoint requests for some job step.
  *	the job will terminate after the checkpoint operation completes
diff --git a/src/common/checkpoint.c b/src/common/checkpoint.c
index eee6ba454f6..38557b992b5 100644
--- a/src/common/checkpoint.c
+++ b/src/common/checkpoint.c
@@ -471,10 +471,10 @@ extern int checkpoint_tasks (uint32_t job_id, uint32_t step_id,
 
 	if ((ret_list = slurm_send_recv_msgs(nodelist, &req_msg, (wait*1000),
 					     false))) {
-		while((ret_data_info = list_pop(ret_list))) {
+		while ((ret_data_info = list_pop(ret_list))) {
                         temp_rc = slurm_get_return_code(ret_data_info->type,
                                                         ret_data_info->data);
-                        if(temp_rc)
+                        if (temp_rc)
                                 rc = temp_rc;
                 }
 	} else {
diff --git a/src/common/checkpoint.h b/src/common/checkpoint.h
index b7bfc90fe43..43431c7ddb5 100644
--- a/src/common/checkpoint.h
+++ b/src/common/checkpoint.h
@@ -56,7 +56,8 @@ enum check_opts {
 	CHECK_VACATE,		/* create a checkpoint for this job,
 				 * job terminates afterwards */
 	CHECK_RESTART,		/* restart a previously checkpointed job */
-	CHECK_ERROR		/* get error info */
+	CHECK_ERROR,		/* get error info */
+	CHECK_REQUEUE		/* CHECK_VACATE + CHECK_RESTART */
 };
 
 /* opaque data structures - no peeking! */
diff --git a/src/plugins/checkpoint/aix/checkpoint_aix.c b/src/plugins/checkpoint/aix/checkpoint_aix.c
index 4916efb26cd..ddfdfb5723b 100644
--- a/src/plugins/checkpoint/aix/checkpoint_aix.c
+++ b/src/plugins/checkpoint/aix/checkpoint_aix.c
@@ -245,6 +245,7 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id,
 #endif
 			break;
 		case CHECK_RESTART:
+		case CHECK_REQUEUE:
 			rc = ESLURM_NOT_SUPPORTED;
 			break;
 		case CHECK_ERROR:
diff --git a/src/plugins/checkpoint/blcr/checkpoint_blcr.c b/src/plugins/checkpoint/blcr/checkpoint_blcr.c
index f4ca8dafb1f..575df897f9d 100644
--- a/src/plugins/checkpoint/blcr/checkpoint_blcr.c
+++ b/src/plugins/checkpoint/blcr/checkpoint_blcr.c
@@ -68,10 +68,22 @@
 #include "src/common/xstring.h"
 #include "src/common/xmalloc.h"
 #include "src/slurmctld/agent.h"
+#include "src/slurmctld/acct_policy.h"
 #include "src/slurmctld/slurmctld.h"
 #include "src/slurmctld/locks.h"
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
 
+/* These are defined here so when we link with something other than
+ * the slurmctld we will have these symbols defined.  They will get
+ * overwritten when linking with the slurmctld.
+ */
+#if defined (__APPLE__)
+void acct_policy_add_job_submit(struct job_record *job_ptr)
+	__attribute__((weak_import));
+#else
+void acct_policy_add_job_submit(struct job_record *job_ptr);
+#endif
+
 #define MAX_PATH_LEN 1024
 
 struct check_job_info {
@@ -90,6 +102,7 @@ struct ckpt_req {
 	uint16_t wait;
 	char *image_dir;
 	char *nodelist;
+	uint32_t op;
 	uint16_t sig_done;
 };
 
@@ -213,6 +226,12 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id,
 	case CHECK_ENABLE:
 		check_ptr->disabled--;
 		break;
+	case CHECK_REQUEUE:
+		if (step_id != SLURM_BATCH_SCRIPT) {
+			rc = ESLURM_NOT_SUPPORTED;
+			break;
+		}
+		/* no break */
 	case CHECK_VACATE:
 		done_sig = SIGTERM;
 		/* no break */
@@ -244,6 +263,7 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id,
 		req_ptr->image_dir = xstrdup(image_dir);
 		req_ptr->nodelist = xstrdup(nodelist);
 		req_ptr->sig_done = done_sig;
+		req_ptr->op = op;
 
 		slurm_attr_init(&attr);
 		if (pthread_attr_setdetachstate(&attr,
@@ -552,6 +572,31 @@ static void _send_sig(uint32_t job_id, uint32_t step_id, uint16_t signal,
 	agent_queue_request(agent_args);
 }
 
+/* Poll each second until the job finishes, then set it back to pending */
+static void _requeue_when_finished(uint32_t job_id)
+{
+	/* Locks: write job */
+	slurmctld_lock_t job_write_lock = {
+		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
+	struct job_record *job_ptr;
+
+	while (1) {
+		lock_slurmctld(job_write_lock);
+		job_ptr = find_job_record(job_id);
+		if (job_ptr && IS_JOB_FINISHED(job_ptr)) {
+			job_ptr->job_state = JOB_PENDING;
+			job_ptr->details->submit_time = time(NULL);
+			job_ptr->restart_cnt++;
+			/* Completion logging removed the submit, re-add it */
+			acct_policy_add_job_submit(job_ptr);
+			job_ptr = NULL;	/* requeued, stop polling */
+		}
+		unlock_slurmctld(job_write_lock);
+		if (!job_ptr)	/* requeued above or job record is gone */
+			break;
+		sleep(1);
+	}
+}
 
 /* Checkpoint processing pthread
  * Never returns, but is cancelled on plugin termiantion */
@@ -575,14 +620,20 @@ static void *_ckpt_agent_thr(void *arg)
 	ckpt_agent_count ++;
 	slurm_mutex_unlock(&ckpt_agent_mutex);
 
-	debug3("checkpoint/blcr: sending checkpoint tasks request to %u.%u",
-	       req->job_id, req->step_id);
+	debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u",
+	       req->op, req->job_id, req->step_id);
 
 	rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time,
 			      req->image_dir, req->wait, req->nodelist);
+	if (rc != SLURM_SUCCESS) {
+		error("checkpoint/blcr: error on checkpoint request %u to "
+		      "%u.%u: %s", req->op, req->job_id, req->step_id,
+		      slurm_strerror(rc));
+	}
+	if (req->op == CHECK_REQUEUE)
+		_requeue_when_finished(req->job_id);
 
 	lock_slurmctld(job_write_lock);
-
 	job_ptr = find_job_record(req->job_id);
 	if (!job_ptr) {
 		error("_ckpt_agent_thr: job finished");
diff --git a/src/plugins/checkpoint/ompi/checkpoint_ompi.c b/src/plugins/checkpoint/ompi/checkpoint_ompi.c
index b592eed50b0..26f49ea06c2 100644
--- a/src/plugins/checkpoint/ompi/checkpoint_ompi.c
+++ b/src/plugins/checkpoint/ompi/checkpoint_ompi.c
@@ -169,6 +169,7 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id,
 			rc = _ckpt_step(step_ptr, data, 1);
 			break;
 		case CHECK_RESTART:
+		case CHECK_REQUEUE:
 			/* Lots of work is required in Slurm to restart a
 			 * checkpointed job. For now the user can submit a
 			 * new job and execute "ompi_restart <snapshot>" */
diff --git a/src/plugins/checkpoint/xlch/checkpoint_xlch.c b/src/plugins/checkpoint/xlch/checkpoint_xlch.c
index c5beae6da02..5562a74817d 100644
--- a/src/plugins/checkpoint/xlch/checkpoint_xlch.c
+++ b/src/plugins/checkpoint/xlch/checkpoint_xlch.c
@@ -253,6 +253,7 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id,
 			rc = _step_ckpt(step_ptr, data, image_dir, SIGKILL);
 			break;
 		case CHECK_RESTART:
+		case CHECK_REQUEUE:
 			rc = ESLURM_NOT_SUPPORTED;
 			break;
 		case CHECK_ERROR:
diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c
index 65773d84406..1d67332479b 100644
--- a/src/scontrol/scontrol.c
+++ b/src/scontrol/scontrol.c
@@ -1615,7 +1615,7 @@ scontrol [<OPTION>] [<COMMAND>]                                            \n\
   (Bluegene systems only)                                                  \n\
                                                                            \n\
   <CH_OP> identify checkpoint operations and may be \"able\", \"disable\", \n\
-  \"enable\", \"create\", \"vacate\", \"restart\", or \"error\".           \n\
+  \"enable\", \"create\", \"vacate\", \"requeue\", \"restart\", or \"error\"\n\
   Additional options include \"ImageDir=<dir>\", \"MaxWait=<seconds>\" and \n\
   \"StickToNodes\"   \n\
                                                                            \n\
diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c
index d210192f9fb..4fb0c8e1736 100644
--- a/src/scontrol/update_job.c
+++ b/src/scontrol/update_job.c
@@ -121,6 +121,12 @@ scontrol_checkpoint(char *op, char *job_step_id_str, int argc, char *argv[])
 		rc = slurm_checkpoint_create (job_id, step_id, max_wait,
 					      image_dir);
 
+	} else if (strncasecmp(op, "requeue", MAX(oplen, 2)) == 0) {
+		if (_parse_checkpoint_args(argc, argv, &max_wait, &image_dir)){
+			return 0;
+		}
+		rc = slurm_checkpoint_requeue (job_id, max_wait, image_dir);
+
 	} else if (strncasecmp(op, "vacate", MAX(oplen, 2)) == 0) {
 		if (_parse_checkpoint_args(argc, argv, &max_wait, &image_dir)){
 			return 0;
diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c
index e64f2e95b92..ac6865456c7 100644
--- a/src/slurmctld/gang.c
+++ b/src/slurmctld/gang.c
@@ -634,10 +634,17 @@ static void _preempt_job_dequeue(void)
 		} else if (preempt_mode == PREEMPT_MODE_CHECKPOINT) {
 			checkpoint_msg_t ckpt_msg;
 			memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
-			ckpt_msg.op	   = CHECK_VACATE;
+			ckpt_msg.op	   = CHECK_REQUEUE;
 			ckpt_msg.job_id    = job_ptr->job_id;
 			rc = job_checkpoint(&ckpt_msg, 0, -1,
 					    (uint16_t)NO_VAL);
+			if (rc == ESLURM_NOT_SUPPORTED) {
+				memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
+				ckpt_msg.op	   = CHECK_VACATE;
+				ckpt_msg.job_id    = job_ptr->job_id;
+				rc = job_checkpoint(&ckpt_msg, 0, -1,
+						    (uint16_t)NO_VAL);
+			}
 			if (rc == SLURM_SUCCESS) {
 				info("preempted job %u has been checkpointed",
 				     job_ptr->job_id);
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index a6baea8ed32..9425fdcf15b 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -8953,7 +8953,8 @@ extern int job_checkpoint(checkpoint_msg_t *ckpt_ptr, uid_t uid,
 	memset((void *)&resp_data, 0, sizeof(checkpoint_resp_msg_t));
 
 	if (job_ptr->batch_flag) { /* operate on batch job */
-		if ((ckpt_ptr->op == CHECK_CREATE) ||
+		if ((ckpt_ptr->op == CHECK_CREATE)  ||
+		    (ckpt_ptr->op == CHECK_REQUEUE) ||
 		    (ckpt_ptr->op == CHECK_VACATE)) {
 			if (job_ptr->details == NULL) {
 				rc = ESLURM_DISABLED;
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 4f01366e278..a1195ac0b09 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -3121,6 +3121,9 @@ inline static void  _slurm_rpc_checkpoint(slurm_msg_t * msg)
 	case CHECK_ERROR:
 		op = "error";
 		break;
+	case CHECK_REQUEUE:
+		op = "requeue";
+		break;
 	case CHECK_RESTART:
 		op = "restart";
 		break;
-- 
GitLab