From 1437dbdb3ca671115d62c14d6867c93a9259e0e1 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 23 Aug 2007 19:18:08 +0000
Subject: [PATCH] Add function to get checkpoint type and scan steps to perform
 checkpoint as required; still need to perform actual checkpoint operation

---
 src/common/slurm_protocol_api.c | 15 ++++++++++
 src/common/slurm_protocol_api.h |  6 ++++
 src/slurmctld/controller.c      |  3 +-
 src/slurmctld/proc_req.c        |  2 +-
 src/slurmctld/slurmctld.h       |  4 +++
 src/slurmctld/step_mgr.c        | 51 ++++++++++++++++++++++++++++++++-
 6 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 44bdc85850c..aa57390eefb 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -244,6 +244,21 @@ char *slurm_get_auth_type(void)
 	return auth_type;
 }
 
+/* slurm_get_checkpoint_type
+ * hands back a duplicate of checkpoint_type from the slurmctld_conf object
+ * RET char *    - checkpoint type, MUST be xfreed by caller
+ */
+extern char *slurm_get_checkpoint_type(void)
+{
+	slurm_ctl_conf_t *ctl_conf = slurm_conf_lock();
+	char *type_copy = xstrdup(ctl_conf->checkpoint_type);
+
+	/* string was duplicated between the lock and unlock calls */
+	slurm_conf_unlock();
+
+	return type_copy;
+}
+
 /* slurm_get_crypto_type
  * returns the crypto_type from slurmctld_conf object
  * RET char *    - crypto type, MUST be xfreed by caller
diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h
index 488ecbd7e69..d98d6b7d122 100644
--- a/src/common/slurm_protocol_api.h
+++ b/src/common/slurm_protocol_api.h
@@ -143,6 +143,12 @@ extern char *slurm_get_auth_type(void);
  */
 extern int slurm_set_auth_type(char *auth_type);
 
+/* slurm_get_checkpoint_type
+ * returns the checkpoint_type from slurmctld_conf object
+ * RET char *    - checkpoint type, MUST be xfreed by caller
+ */
+extern char *slurm_get_checkpoint_type(void);
+
 /* slurm_get_crypto_type
  * returns the crypto_type from slurmctld_conf object
  * RET char *    - crypto type, MUST be xfreed by caller
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 8c62c44b4ab..4366bebd626 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -846,9 +846,10 @@ static void *_slurmctld_background(void *no_data)
 
 		if (difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) {
 			last_timelimit_time = now;
-			debug2("Performing job time limit check");
+			debug2("Performing job time limit and checkpoint test");
 			lock_slurmctld(job_write_lock);
 			job_time_limit();
+			step_checkpoint();
 			unlock_slurmctld(job_write_lock);
 		}
 
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index f79c03aca2a..47dc9a4cfa5 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -2435,7 +2435,7 @@ int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid,
 	req_step_msg.name = job_desc_msg->name;
 	req_step_msg.network = NULL;
 	req_step_msg.node_list = NULL;
-	req_step_msg.ckpt_interval = (uint16_t) NO_VAL;
+	req_step_msg.ckpt_interval = 0;
 
 	error_code = step_create(&req_step_msg, &step_rec, false, true);
 	xfree(req_step_msg.node_list);	/* may be set by step_create */
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 1ab49260e7a..80644110194 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -408,6 +408,7 @@ struct 	step_record {
 	char *host;			/* host for srun communications */
 	uint16_t batch_step;		/* 1 if batch job step, 0 otherwise */
 	uint16_t ckpt_interval;		/* checkpoint interval in minutes */
+	time_t ckpt_time;		/* time of last checkpoint */
 	switch_jobinfo_t switch_job;	/* switch context, opaque */
 	check_jobinfo_t check_job;	/* checkpoint context, opaque */
 	char *name;			/* name of job step */
@@ -1234,6 +1235,9 @@ extern void signal_step_tasks(struct step_record *step_ptr, uint16_t signal);
  */
 extern int slurmctld_shutdown(void);
 
+/* Perform periodic job step checkpoints (per user request) */
+extern void step_checkpoint(void);
+
 /*
  * step_create - creates a step_record in step_specs->job_id, sets up the
  *	according to the step_specs.
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 858de25d141..2cd866bc602 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -806,6 +806,7 @@ step_create(job_step_create_request_msg_t *step_specs,
 	step_ptr->host = xstrdup(step_specs->host);
 	step_ptr->batch_step = batch_step;
 	step_ptr->ckpt_interval = step_specs->ckpt_interval;
+	step_ptr->ckpt_time = now;
 	step_ptr->exit_code = NO_VAL;
 
 	/* step's name and network default to job's values if not 
@@ -1494,6 +1495,8 @@ extern void dump_job_step_state(struct step_record *step_ptr, Buf buffer)
 
 	pack_time(step_ptr->start_time, buffer);
 	pack_time(step_ptr->pre_sus_time, buffer);
+	pack_time(step_ptr->ckpt_time, buffer);
+
 	packstr(step_ptr->host,  buffer);
 	packstr(step_ptr->name, buffer);
 	packstr(step_ptr->network, buffer);
@@ -1516,7 +1519,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer)
 	uint16_t step_id, cyclic_alloc, name_len, port, batch_step, bit_cnt;
 	uint16_t ckpt_interval;
 	uint32_t exit_code;
-	time_t start_time, pre_sus_time;
+	time_t start_time, pre_sus_time, ckpt_time;
 	char *host = NULL;
 	char *name = NULL, *network = NULL, *bit_fmt = NULL;
 	switch_jobinfo_t switch_tmp = NULL;
@@ -1536,6 +1539,8 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer)
 	
 	safe_unpack_time(&start_time, buffer);
 	safe_unpack_time(&pre_sus_time, buffer);
+	safe_unpack_time(&ckpt_time, buffer);
+
 	safe_unpackstr_xmalloc(&host, &name_len, buffer);
 	safe_unpackstr_xmalloc(&name, &name_len, buffer);
 	safe_unpackstr_xmalloc(&network, &name_len, buffer);
@@ -1576,6 +1581,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer)
 	host                   = NULL;  /* re-used, nothing left to free */
 	step_ptr->start_time   = start_time;
 	step_ptr->pre_sus_time = pre_sus_time;
+	step_ptr->ckpt_time    = ckpt_time;
 
 	slurm_step_layout_destroy(step_ptr->step_layout);
 	step_ptr->step_layout  = step_layout;
@@ -1617,3 +1623,46 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer)
 	slurm_step_layout_destroy(step_layout);
 	return SLURM_FAILURE;
 }
+
+/* Perform periodic job step checkpoints (per user request)
+ * NOTE: due steps are only logged and re-timed; no checkpoint is issued yet */
+extern void step_checkpoint(void)
+{
+	static int ckpt_run = -1;	/* cached: -1 unknown, 0 off, 1 on */
+	time_t now = time(NULL), ckpt_due;
+	ListIterator job_iterator;
+	struct job_record *job_ptr;
+	ListIterator step_iterator;
+	struct step_record *step_ptr;
+
+	/* Exit if "checkpoint/none" is configured; NULL type is treated alike */
+	if (ckpt_run == -1) {
+		char *ckpt_type = slurm_get_checkpoint_type();
+		ckpt_run = (ckpt_type &&
+			    strcasecmp(ckpt_type, "checkpoint/none")) ? 1 : 0;
+		xfree(ckpt_type);
+	}
+	if (ckpt_run == 0)
+		return;
+
+	job_iterator = list_iterator_create(job_list);
+	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
+		if (job_ptr->job_state != JOB_RUNNING)
+			continue;
+		step_iterator = list_iterator_create (job_ptr->step_list);
+		while ((step_ptr = (struct step_record *)
+				list_next (step_iterator))) {
+			if (step_ptr->ckpt_interval == 0)
+				continue;	/* no periodic checkpoint set */
+			ckpt_due = step_ptr->ckpt_time +
+				(step_ptr->ckpt_interval * 60);
+			if (ckpt_due > now)
+				continue;
+			info("checkpoint %u.%u now",
+			     job_ptr->job_id, step_ptr->step_id);
+			step_ptr->ckpt_time = now;
+		}
+		list_iterator_destroy (step_iterator);
+	}
+	list_iterator_destroy(job_iterator);
+}
-- 
GitLab