diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 44bdc85850c2df2ff20eb15e24e1484659e29416..aa57390eefba4d10b5b38260cf638acf5aa35821 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -244,6 +244,21 @@ char *slurm_get_auth_type(void) return auth_type; } +/* slurm_get_checkpoint_type + * returns an xstrdup() copy of checkpoint_type, made under slurm_conf_lock() + * RET char * - checkpoint type, MUST be xfreed by caller + */ +extern char *slurm_get_checkpoint_type(void) +{ + char *checkpoint_type; + slurm_ctl_conf_t *conf; + + conf = slurm_conf_lock(); + checkpoint_type = xstrdup(conf->checkpoint_type); + slurm_conf_unlock(); + return checkpoint_type; +} + /* slurm_get_crypto_type * returns the crypto_type from slurmctld_conf object * RET char * - crypto type, MUST be xfreed by caller diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 488ecbd7e694f7e05bf9d1283cc8d8a3714480d9..d98d6b7d122c6b97fcb49e8994258fb200c8bcfa 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -143,6 +143,12 @@ extern char *slurm_get_auth_type(void); */ extern int slurm_set_auth_type(char *auth_type); +/* slurm_get_checkpoint_type + * returns the checkpoint_type from slurmctld_conf object + * RET char * - checkpoint type, MUST be xfreed by caller + */ +extern char *slurm_get_checkpoint_type(void); + /* slurm_get_crypto_type * returns the crypto_type from slurmctld_conf object * RET char * - crypto type, MUST be xfreed by caller diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 8c62c44b4aba2824e2c3f47b3665a1e39e93e341..4366bebd626ae5867343d809ea4ca9f62c53929c 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -846,9 +846,10 @@ static void *_slurmctld_background(void *no_data) if (difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) { last_timelimit_time = now; - debug2("Performing
job time limit and checkpoint test"); lock_slurmctld(job_write_lock); job_time_limit(); + step_checkpoint(); unlock_slurmctld(job_write_lock); } diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index f79c03aca2a76f66b271c6d1cab06a803f77eff7..47dc9a4cfa5839294a17ee4f7261c77560c9984d 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -2435,7 +2435,7 @@ int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid, req_step_msg.name = job_desc_msg->name; req_step_msg.network = NULL; req_step_msg.node_list = NULL; - req_step_msg.ckpt_interval = (uint16_t) NO_VAL; + req_step_msg.ckpt_interval = 0; error_code = step_create(&req_step_msg, &step_rec, false, true); xfree(req_step_msg.node_list); /* may be set by step_create */ diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 1ab49260e7aa19992864b600e8b970d60957267b..806441101944812e6d9088a01f0699c33b20acb4 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -408,6 +408,7 @@ struct step_record { char *host; /* host for srun communications */ uint16_t batch_step; /* 1 if batch job step, 0 otherwise */ uint16_t ckpt_interval; /* checkpoint interval in minutes */ + time_t ckpt_time; /* time of last checkpoint */ switch_jobinfo_t switch_job; /* switch context, opaque */ check_jobinfo_t check_job; /* checkpoint context, opaque */ char *name; /* name of job step */ @@ -1234,6 +1235,9 @@ extern void signal_step_tasks(struct step_record *step_ptr, uint16_t signal); */ extern int slurmctld_shutdown(void); +/* Perform periodic job step checkpoints (per user request) */ +extern void step_checkpoint(void); + /* * step_create - creates a step_record in step_specs->job_id, sets up the * according to the step_specs. 
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 858de25d1416274160f0b6ed06f657e63aa7238d..2cd866bc602c99e25d88a391e82ac7f5c0140486 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -806,6 +806,7 @@ step_create(job_step_create_request_msg_t *step_specs, step_ptr->host = xstrdup(step_specs->host); step_ptr->batch_step = batch_step; step_ptr->ckpt_interval = step_specs->ckpt_interval; + step_ptr->ckpt_time = now; step_ptr->exit_code = NO_VAL; /* step's name and network default to job's values if not @@ -1494,6 +1495,8 @@ extern void dump_job_step_state(struct step_record *step_ptr, Buf buffer) pack_time(step_ptr->start_time, buffer); pack_time(step_ptr->pre_sus_time, buffer); + pack_time(step_ptr->ckpt_time, buffer); + packstr(step_ptr->host, buffer); packstr(step_ptr->name, buffer); packstr(step_ptr->network, buffer); @@ -1516,7 +1519,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer) uint16_t step_id, cyclic_alloc, name_len, port, batch_step, bit_cnt; uint16_t ckpt_interval; uint32_t exit_code; - time_t start_time, pre_sus_time; + time_t start_time, pre_sus_time, ckpt_time; char *host = NULL; char *name = NULL, *network = NULL, *bit_fmt = NULL; switch_jobinfo_t switch_tmp = NULL; @@ -1536,6 +1539,8 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer) safe_unpack_time(&start_time, buffer); safe_unpack_time(&pre_sus_time, buffer); + safe_unpack_time(&ckpt_time, buffer); + safe_unpackstr_xmalloc(&host, &name_len, buffer); safe_unpackstr_xmalloc(&name, &name_len, buffer); safe_unpackstr_xmalloc(&network, &name_len, buffer); @@ -1576,6 +1581,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer) host = NULL; /* re-used, nothing left to free */ step_ptr->start_time = start_time; step_ptr->pre_sus_time = pre_sus_time; + step_ptr->ckpt_time = ckpt_time; slurm_step_layout_destroy(step_ptr->step_layout); step_ptr->step_layout = step_layout; @@ -1617,3 +1623,46 @@ extern int 
load_step_state(struct job_record *job_ptr, Buf buffer)
 	slurm_step_layout_destroy(step_layout);
 	return SLURM_FAILURE;
 }
+
+/* Perform periodic job step checkpoints (per user request).
+ * Each step of a running job with a non-zero ckpt_interval (minutes) that
+ * is due gets logged and its ckpt_time reset; nothing else is done here. */
+extern void step_checkpoint(void)
+{
+	static int ckpt_run = -1;	/* -1: not yet tested, 0: off, 1: on */
+	time_t now = time(NULL);
+	ListIterator job_iterator, step_iterator;
+	struct job_record *job_ptr;
+	struct step_record *step_ptr;
+
+	/* Exit if "checkpoint/none" is configured (tested on first call only) */
+	if (ckpt_run == -1) {
+		char *ckpt_type = slurm_get_checkpoint_type();
+		/* NULL type is treated as "none"; strcasecmp(NULL) is undefined */
+		ckpt_run = (ckpt_type != NULL) &&
+			   (strcasecmp(ckpt_type, "checkpoint/none") != 0);
+		xfree(ckpt_type);
+	}
+	if (ckpt_run == 0)
+		return;
+
+	job_iterator = list_iterator_create(job_list);
+	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
+		if (job_ptr->job_state != JOB_RUNNING)
+			continue;
+		step_iterator = list_iterator_create(job_ptr->step_list);
+		while ((step_ptr = (struct step_record *)
+				list_next(step_iterator))) {
+			if (step_ptr->ckpt_interval == 0)
+				continue;	/* periodic checkpoint not requested */
+			if ((step_ptr->ckpt_time +
+			     (step_ptr->ckpt_interval * 60)) > now)
+				continue;	/* not yet due */
+			info("checkpoint %u.%u now",
+			     job_ptr->job_id, step_ptr->step_id);
+			step_ptr->ckpt_time = now;
+		}
+		list_iterator_destroy(step_iterator);
+	}
+	list_iterator_destroy(job_iterator);
+}