From 5f9eb24a91c23de77635f7770a579cc36d1388a4 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 28 Sep 2010 23:36:24 +0000 Subject: [PATCH] -- Added new checkpoint mode "scontrol checkpoint requeue <job_id>" to checkpoint and requeue a batch job. Note, there appear to be some problems with checkpoint/blcr which are unrelated to these changes. --- doc/man/man1/scontrol.1 | 41 ++++++++----- slurm/slurm.h.in | 11 ++++ src/api/checkpoint.c | 15 +++++ src/common/checkpoint.c | 4 +- src/common/checkpoint.h | 3 +- src/plugins/checkpoint/aix/checkpoint_aix.c | 1 + src/plugins/checkpoint/blcr/checkpoint_blcr.c | 57 ++++++++++++++++++- src/plugins/checkpoint/ompi/checkpoint_ompi.c | 1 + src/plugins/checkpoint/xlch/checkpoint_xlch.c | 1 + src/scontrol/scontrol.c | 2 +- src/scontrol/update_job.c | 6 ++ src/slurmctld/gang.c | 9 ++- src/slurmctld/job_mgr.c | 3 +- src/slurmctld/proc_req.c | 3 + 14 files changed, 135 insertions(+), 22 deletions(-) diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 1566070965c..e06bd6ba209 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -1,4 +1,4 @@ -.TH SCONTROL "1" "July 2010" "scontrol 2.2" "Slurm components" +.TH SCONTROL "1" "September 2010" "scontrol 2.2" "Slurm components" .SH "NAME" scontrol \- Used view and modify Slurm configuration and state. @@ -75,33 +75,48 @@ which applies to all of its existing steps) or a specific job step (e.g. "<job_id>.<step_id>"). 
Acceptable values for \fICKPT_OP\fP include: .RS +.TP 12 +\fIable\fP +Test if presently not disabled, report start time if checkpoint in progress .TP -\fIdisable\fP (disable future checkpoints) -.TP -\fIenable\fP (enable future checkpoints) +\fIcreate\fP +Create a checkpoint and continue the job or job step .TP -\fIable\fP (test if presently not disabled, report start time if checkpoint in progress) +\fIdisable\fP +Disable future checkpoints .TP -\fIcreate\fP (create a checkpoint and continue the job step) +\fIenable\fP +Enable future checkpoints .TP -\fIvacate\fP (create a checkpoint and terminate the job step) +\fIerror\fP +Report the result for the last checkpoint request, error code and message .TP -\fIerror\fP (report the result for the last checkpoint request, error code and message) +\fIrestart\fP +Restart execution of the previously checkpointed job or job step .TP -\fIrestart\fP (restart execution of the previously checkpointed job steps) +\fIrequeue\fP +Create a checkpoint and requeue the batch job, combines vacate +and restart operations .TP +\fIvacate\fP +Create a checkpoint and terminate the job or job step +.RE Acceptable values for \fICKPT_OP\fP include: -.TP -\fIMaxWait=<seconds>\fP maximum time for checkpoint to be written. +.RS +.TP 20 +\fIMaxWait=<seconds>\fP +Maximum time for checkpoint to be written. Default value is 10 seconds. Valid with \fIcreate\fP and \fIvacate\fP options only. .TP -\fIImageDir=<directory_name>\fP Location of checkpoint file. +\fIImageDir=<directory_name>\fP +Location of checkpoint file. Valid with \fIcreate\fP, \fIvacate\fP and \fIrestart\fP options only. This value takes precedent over any \-\-checkpoint\-dir value specified at job submission time. .TP -\fIStickToNodes\fP If set, resume job on the same nodes are previously used. +\fIStickToNodes\fP +If set, resume job on the same nodes as previously used. Valid with the \fIrestart\fP option only. 
.RE diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 41fae1bcd44..3c9ae9c3579 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -3083,6 +3083,17 @@ extern int slurm_checkpoint_create PARAMS((uint32_t job_id, uint16_t max_wait, char *image_dir)); +/* + * slurm_checkpoint_requeue - initiate a checkpoint request for some job. + * the job will be requeued after the checkpoint operation completes + * IN job_id - job on which to perform operation + * IN max_wait - maximum wait for operation to complete, in seconds + * IN image_dir - directory used to get/put checkpoint images + * RET 0 or a slurm error code + */ +extern int slurm_checkpoint_requeue PARAMS((uint32_t job_id, uint16_t max_wait, + char *image_dir)); + /* * slurm_checkpoint_vacate - initiate a checkpoint requests for some job step. * the job will terminate after the checkpoint operation completes diff --git a/src/api/checkpoint.c b/src/api/checkpoint.c index b571e836ed5..edb0550c0b8 100644 --- a/src/api/checkpoint.c +++ b/src/api/checkpoint.c @@ -179,6 +179,21 @@ extern int slurm_checkpoint_create (uint32_t job_id, uint32_t step_id, image_dir); } +/* + * slurm_checkpoint_requeue - initiate a checkpoint request for some job. + * the job will be requeued after the checkpoint operation completes + * IN job_id - job on which to perform operation + * IN max_wait - maximum wait for operation to complete, in seconds + * IN image_dir - directory used to get/put checkpoint images + * RET 0 or a slurm error code + */ +extern int slurm_checkpoint_requeue (uint32_t job_id, uint16_t max_wait, + char *image_dir) +{ + return _checkpoint_op (CHECK_REQUEUE, max_wait, job_id, + (uint32_t) SLURM_BATCH_SCRIPT, image_dir); +} + /* * slurm_checkpoint_vacate - initiate a checkpoint requests for some job step. 
* the job will terminate after the checkpoint operation completes diff --git a/src/common/checkpoint.c b/src/common/checkpoint.c index eee6ba454f6..38557b992b5 100644 --- a/src/common/checkpoint.c +++ b/src/common/checkpoint.c @@ -471,10 +471,10 @@ extern int checkpoint_tasks (uint32_t job_id, uint32_t step_id, if ((ret_list = slurm_send_recv_msgs(nodelist, &req_msg, (wait*1000), false))) { - while((ret_data_info = list_pop(ret_list))) { + while ((ret_data_info = list_pop(ret_list))) { temp_rc = slurm_get_return_code(ret_data_info->type, ret_data_info->data); - if(temp_rc) + if (temp_rc) rc = temp_rc; } } else { diff --git a/src/common/checkpoint.h b/src/common/checkpoint.h index b7bfc90fe43..43431c7ddb5 100644 --- a/src/common/checkpoint.h +++ b/src/common/checkpoint.h @@ -56,7 +56,8 @@ enum check_opts { CHECK_VACATE, /* create a checkpoint for this job, * job terminates afterwards */ CHECK_RESTART, /* restart a previously checkpointed job */ - CHECK_ERROR /* get error info */ + CHECK_ERROR, /* get error info */ + CHECK_REQUEUE /* CHECK_VACATE + CHECK_RESTART */ }; /* opaque data structures - no peeking! 
*/ diff --git a/src/plugins/checkpoint/aix/checkpoint_aix.c b/src/plugins/checkpoint/aix/checkpoint_aix.c index 4916efb26cd..ddfdfb5723b 100644 --- a/src/plugins/checkpoint/aix/checkpoint_aix.c +++ b/src/plugins/checkpoint/aix/checkpoint_aix.c @@ -245,6 +245,7 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id, #endif break; case CHECK_RESTART: + case CHECK_REQUEUE: rc = ESLURM_NOT_SUPPORTED; break; case CHECK_ERROR: diff --git a/src/plugins/checkpoint/blcr/checkpoint_blcr.c b/src/plugins/checkpoint/blcr/checkpoint_blcr.c index f4ca8dafb1f..575df897f9d 100644 --- a/src/plugins/checkpoint/blcr/checkpoint_blcr.c +++ b/src/plugins/checkpoint/blcr/checkpoint_blcr.c @@ -68,10 +68,22 @@ #include "src/common/xstring.h" #include "src/common/xmalloc.h" #include "src/slurmctld/agent.h" +#include "src/slurmctld/acct_policy.h" #include "src/slurmctld/slurmctld.h" #include "src/slurmctld/locks.h" #include "src/slurmd/slurmstepd/slurmstepd_job.h" +/* These are defined here so when we link with something other than + * the slurmctld we will have these symbols defined. They will get + * overwritten when linking with the slurmctld. 
+ */ +#if defined (__APPLE__) +void acct_policy_add_job_submit(struct job_record *job_ptr) + __attribute__((weak_import)); +#else +void acct_policy_add_job_submit(struct job_record *job_ptr); +#endif + #define MAX_PATH_LEN 1024 struct check_job_info { @@ -90,6 +102,7 @@ struct ckpt_req { uint16_t wait; char *image_dir; char *nodelist; + uint32_t op; uint16_t sig_done; }; @@ -213,6 +226,12 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id, case CHECK_ENABLE: check_ptr->disabled--; break; + case CHECK_REQUEUE: + if (step_id != SLURM_BATCH_SCRIPT) { + rc = ESLURM_NOT_SUPPORTED; + break; + } + /* no break */ case CHECK_VACATE: done_sig = SIGTERM; /* no break */ @@ -244,6 +263,7 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id, req_ptr->image_dir = xstrdup(image_dir); req_ptr->nodelist = xstrdup(nodelist); req_ptr->sig_done = done_sig; + req_ptr->op = op; slurm_attr_init(&attr); if (pthread_attr_setdetachstate(&attr, @@ -552,6 +572,31 @@ static void _send_sig(uint32_t job_id, uint32_t step_id, uint16_t signal, agent_queue_request(agent_args); } +static void _requeue_when_finished(uint32_t job_id) +{ + /* Locks: write job */ + slurmctld_lock_t job_write_lock = { + NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; + struct job_record *job_ptr; + + while (1) { + lock_slurmctld(job_write_lock); + job_ptr = find_job_record(job_id); + if (IS_JOB_FINISHED(job_ptr)) { + job_ptr->job_state = JOB_PENDING; + job_ptr->details->submit_time = time(NULL); + job_ptr->restart_cnt++; + /* Since the job completion logger + * removes the submit we need to add it again. 
*/ + acct_policy_add_job_submit(job_ptr); + unlock_slurmctld(job_write_lock); + break; + } else { + unlock_slurmctld(job_write_lock); + sleep(1); + } + } +} /* Checkpoint processing pthread * Never returns, but is cancelled on plugin termiantion */ @@ -575,14 +620,20 @@ static void *_ckpt_agent_thr(void *arg) ckpt_agent_count ++; slurm_mutex_unlock(&ckpt_agent_mutex); - debug3("checkpoint/blcr: sending checkpoint tasks request to %u.%u", - req->job_id, req->step_id); + debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u", + req->op, req->job_id, req->step_id); rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time, req->image_dir, req->wait, req->nodelist); + if (rc != SLURM_SUCCESS) { + error("checkpoint/blcr: error on checkpoint request %u to " + "%u.%u: %s", req->op, req->job_id, req->step_id, + slurm_strerror(rc)); + } + if (req->op == CHECK_REQUEUE) + _requeue_when_finished(req->job_id); lock_slurmctld(job_write_lock); - job_ptr = find_job_record(req->job_id); if (!job_ptr) { error("_ckpt_agent_thr: job finished"); diff --git a/src/plugins/checkpoint/ompi/checkpoint_ompi.c b/src/plugins/checkpoint/ompi/checkpoint_ompi.c index b592eed50b0..26f49ea06c2 100644 --- a/src/plugins/checkpoint/ompi/checkpoint_ompi.c +++ b/src/plugins/checkpoint/ompi/checkpoint_ompi.c @@ -169,6 +169,7 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id, rc = _ckpt_step(step_ptr, data, 1); break; case CHECK_RESTART: + case CHECK_REQUEUE: /* Lots of work is required in Slurm to restart a * checkpointed job. 
For now the user can submit a * new job and execute "ompi_restart <snapshot>" */ diff --git a/src/plugins/checkpoint/xlch/checkpoint_xlch.c b/src/plugins/checkpoint/xlch/checkpoint_xlch.c index c5beae6da02..5562a74817d 100644 --- a/src/plugins/checkpoint/xlch/checkpoint_xlch.c +++ b/src/plugins/checkpoint/xlch/checkpoint_xlch.c @@ -253,6 +253,7 @@ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id, rc = _step_ckpt(step_ptr, data, image_dir, SIGKILL); break; case CHECK_RESTART: + case CHECK_REQUEUE: rc = ESLURM_NOT_SUPPORTED; break; case CHECK_ERROR: diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 65773d84406..1d67332479b 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -1615,7 +1615,7 @@ scontrol [<OPTION>] [<COMMAND>] \n\ (Bluegene systems only) \n\ \n\ <CH_OP> identify checkpoint operations and may be \"able\", \"disable\", \n\ - \"enable\", \"create\", \"vacate\", \"restart\", or \"error\". \n\ + \"enable\", \"create\", \"vacate\", \"requeue\", \"restart\", or \"error\"\n\ Additional options include \"ImageDir=<dir>\", \"MaxWait=<seconds>\" and \n\ \"StickToNodes\" \n\ \n\ diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index d210192f9fb..4fb0c8e1736 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -121,6 +121,12 @@ scontrol_checkpoint(char *op, char *job_step_id_str, int argc, char *argv[]) rc = slurm_checkpoint_create (job_id, step_id, max_wait, image_dir); + } else if (strncasecmp(op, "requeue", MAX(oplen, 2)) == 0) { + if (_parse_checkpoint_args(argc, argv, &max_wait, &image_dir)){ + return 0; + } + rc = slurm_checkpoint_requeue (job_id, max_wait, image_dir); + } else if (strncasecmp(op, "vacate", MAX(oplen, 2)) == 0) { if (_parse_checkpoint_args(argc, argv, &max_wait, &image_dir)){ return 0; diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c index e64f2e95b92..ac6865456c7 100644 --- a/src/slurmctld/gang.c +++ b/src/slurmctld/gang.c @@ -634,10 +634,17 @@ 
static void _preempt_job_dequeue(void) } else if (preempt_mode == PREEMPT_MODE_CHECKPOINT) { checkpoint_msg_t ckpt_msg; memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t)); - ckpt_msg.op = CHECK_VACATE; + ckpt_msg.op = CHECK_REQUEUE; ckpt_msg.job_id = job_ptr->job_id; rc = job_checkpoint(&ckpt_msg, 0, -1, (uint16_t)NO_VAL); + if (rc == ESLURM_NOT_SUPPORTED) { + memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t)); + ckpt_msg.op = CHECK_VACATE; + ckpt_msg.job_id = job_ptr->job_id; + rc = job_checkpoint(&ckpt_msg, 0, -1, + (uint16_t)NO_VAL); + } if (rc == SLURM_SUCCESS) { info("preempted job %u has been checkpointed", job_ptr->job_id); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index a6baea8ed32..9425fdcf15b 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -8953,7 +8953,8 @@ extern int job_checkpoint(checkpoint_msg_t *ckpt_ptr, uid_t uid, memset((void *)&resp_data, 0, sizeof(checkpoint_resp_msg_t)); if (job_ptr->batch_flag) { /* operate on batch job */ - if ((ckpt_ptr->op == CHECK_CREATE) || + if ((ckpt_ptr->op == CHECK_CREATE) || + (ckpt_ptr->op == CHECK_REQUEUE) || (ckpt_ptr->op == CHECK_VACATE)) { if (job_ptr->details == NULL) { rc = ESLURM_DISABLED; diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 4f01366e278..a1195ac0b09 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -3121,6 +3121,9 @@ inline static void _slurm_rpc_checkpoint(slurm_msg_t * msg) case CHECK_ERROR: op = "error"; break; + case CHECK_REQUEUE: + op = "requeue"; + break; case CHECK_RESTART: op = "restart"; break; -- GitLab