From f8ae6cc4e625f356d4e220e2a24a2eba9d4943c0 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 20 Jul 2009 21:02:40 +0000 Subject: [PATCH] add a PreemptMode value of cancel --- NEWS | 6 ++--- doc/html/preempt.shtml | 22 +++++++++++-------- doc/man/man5/slurm.conf.5 | 11 ++++++---- slurm/slurm.h.in | 1 + src/api/config_info.c | 6 +++-- src/common/read_config.c | 12 +++++----- src/plugins/select/cons_res/select_cons_res.c | 3 ++- src/plugins/select/linear/select_linear.c | 3 ++- src/slurmctld/gang.c | 14 +++++++++++- 9 files changed, 52 insertions(+), 26 deletions(-) diff --git a/NEWS b/NEWS index de24533f551..686a8550b3d 100644 --- a/NEWS +++ b/NEWS @@ -6,11 +6,11 @@ documents those changes that are of interest to users and admins. -- Removed sched/gang plugin and moved the logic directly into the slurmctld daemon so that job preemption and gang scheduling can be used with the sched/backfill plugin. Added configuration parameter: - PreemptMode=off|suspend|kill|requeue|checkpoint + PreemptMode=off|suspend|cancel|requeue|checkpoint to enable/disable the job preemption logic (disabled by default). - (NOTE: There are some problems with memory management which could prevent a + (NOTE: There are some problems with memory management which could prevent a job from starting when memory would be freed by a job being requeued or - otherwise removed) + otherwise removed; these are being worked on) -- If the --partition option is used with the sinfo or squeue command then print information about even hidden partitions. -- Replaced misc cpu allocation members in job_info_t with select_job_res_t diff --git a/doc/html/preempt.shtml b/doc/html/preempt.shtml index a1818099662..5b2d4843ce2 100644 --- a/doc/html/preempt.shtml +++ b/doc/html/preempt.shtml @@ -76,15 +76,19 @@ limits, it will be canceled in order to prevent it from adversely effecting other jobs sharing the same resources. 
</LI> <LI> -<B>PreemptMode</B>: Configure to <I>CHECKPOINT</I>, <I>SUSPEND</I> or -<I>REQUEUE</I> depending on the desired action for low priority jobs. -A value of <I>CHECKPOINT</I> will checkpoint (if possible) or kill low -priority jobs. -Checkpointed jobs are not automatically restarted. -A value of <I>REQUEUE</I> will requeue (if possible) or kill low priority jobs. -Requeued jobs are permitted to be restarted on different resources. -A value of <I>SUSPEND</I> will suspend and automatically resume the low +<B>PreemptMode</B>: Configure to <I>CANCEL</I>, <I>CHECKPOINT</I>, +<I>SUSPEND</I> or <I>REQUEUE</I> depending on the desired action for low priority jobs. +<UL> +<LI>A value of <I>CANCEL</I> will always cancel the job.</LI> +<LI>A value of <I>CHECKPOINT</I> will checkpoint (if possible) or kill low +priority jobs. +Checkpointed jobs are not automatically restarted.</LI> +<LI>A value of <I>REQUEUE</I> will requeue (if possible) or kill low priority +jobs. Requeued jobs are permitted to be restarted on different resources.</LI> +<LI>A value of <I>SUSPEND</I> will suspend and automatically resume the low +priority jobs. </LI> +</UL> </LI> <LI> <B>Priority</B>: Configure the partition's <I>Priority</I> setting relative to @@ -309,6 +313,6 @@ It would also necessitate major changes to the shadow bitmap data structures which are currently used. </P> -<p style="text-align:center;">Last modified 14 July 2009</p> +<p style="text-align:center;">Last modified 20 July 2009</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index e84dcf04050..9e468ace43b 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -720,8 +720,8 @@ on SPANK plugins, see the \fBspank\fR(8) manual. Controls the mechanism used to preempt lower priority jobs in order to execute higher priority jobs. 
For gang scheduling of jobs having the same priority, job suspend will -always be used and \fBPreemptMode\fR must be set to \fIKILL\fR or \fISUSPEND\fR -(anything other than OFF). +always be used and \fBPreemptMode\fR must be set to \fICANCEL\fR, +\fIREQUEUE\fR or \fISUSPEND\fR (anything other than \fIOFF\fR). Two different mechanisms can be used to determine when preeption can take place. SLURM partitions can have different \fBPriority\fR values and contain @@ -740,11 +740,14 @@ This is the only option compatible with \fBSchedulerType=sched/wiki\fR or \fBSchedulerType=sched/wiki2\fR (used by Maui and Moab respecitvely, which provide their own job preemption functionality). .TP +\fBCANCEL\fR +always cancel the job. +.TP \fBCHECKPOINT\fR -preempts jobs by checkpointing them (if possible) or killing them. +preempts jobs by checkpointing them (if possible) or cancelling them. .TP \fBREQUEUE\fR -preempts jobs by requeuing them (if possible) or killing them. +preempts jobs by requeuing them (if possible) or cancelling them. .TP \fBSUSPEND\fR preempts jobs by suspending them. 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index a7d731e3d5b..1a6795fbf70 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1109,6 +1109,7 @@ typedef struct reservation_name_msg { #define PREEMPT_MODE_REQUEUE 0x0002 /* requeue or kill jobs to preempt */ #define PREEMPT_MODE_CHECKPOINT 0x0003 /* checkpoint job to preempt, * no automatic restart */ +#define PREEMPT_MODE_CANCEL 0x0004 /* always cancel the job */ typedef struct slurm_ctl_conf { time_t last_update; /* last update time of the build parameters */ diff --git a/src/api/config_info.c b/src/api/config_info.c index 8245f212951..1da7e62df9f 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -301,12 +301,14 @@ void slurm_print_ctl_conf ( FILE* out, if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_OFF) fprintf(out, "PreemptMode = OFF\n"); - else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_SUSPEND) - fprintf(out, "PreemptMode = SUSPEND\n"); + else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_CANCEL) + fprintf(out, "PreemptMode = CANCEL\n"); else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_CHECKPOINT) fprintf(out, "PreemptMode = CHECKPOINT\n"); else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_REQUEUE) fprintf(out, "PreemptMode = REQUEUE\n"); + else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_SUSPEND) + fprintf(out, "PreemptMode = SUSPEND\n"); if (strcmp(slurm_ctl_conf_ptr->priority_type, "priority/basic") == 0) { fprintf(out, "PriorityType = %s\n", diff --git a/src/common/read_config.c b/src/common/read_config.c index d8bc858d00b..f56180da69d 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -2121,14 +2121,16 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) if (s_p_get_string(&temp_str, "PreemptMode", hashtbl)) { if (strcasecmp(temp_str, "off") == 0) conf->preempt_mode = PREEMPT_MODE_OFF; + else if (strcasecmp(temp_str, "cancel") == 0) + conf->preempt_mode = PREEMPT_MODE_CANCEL; + else if 
(strcasecmp(temp_str, "checkpoint") == 0) + conf->preempt_mode = PREEMPT_MODE_CHECKPOINT; + else if (strcasecmp(temp_str, "requeue") == 0) + conf->preempt_mode = PREEMPT_MODE_REQUEUE; else if ((strcasecmp(temp_str, "on") == 0) || (strcasecmp(temp_str, "suspend") == 0)) conf->preempt_mode = PREEMPT_MODE_SUSPEND; - else if ((strcasecmp(temp_str, "kill") == 0) || - (strcasecmp(temp_str, "requeue") == 0)) - conf->preempt_mode = PREEMPT_MODE_REQUEUE; - else if (strcasecmp(temp_str, "checkpoint") == 0) - conf->preempt_mode = PREEMPT_MODE_CHECKPOINT; + else fatal("Invalid PreemptMode: %s", temp_str); } diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index d929bb08b20..f955843e47c 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -255,7 +255,8 @@ extern bool cr_preemption_enabled(void) uint16_t mode = slurm_get_preempt_mode(); if (mode == PREEMPT_MODE_SUSPEND) job_preemption_enabled = true; - else if ((mode == PREEMPT_MODE_CHECKPOINT) || + else if ((mode == PREEMPT_MODE_CANCEL) || + (mode == PREEMPT_MODE_CHECKPOINT) || (mode == PREEMPT_MODE_REQUEUE)) { job_preemption_enabled = true; job_preemption_killing = true; diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 5f745b325b4..83ab35d3714 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -279,7 +279,8 @@ static inline bool _job_preemption_enabled(void) uint16_t mode = slurm_get_preempt_mode(); if (mode == PREEMPT_MODE_SUSPEND) job_preemption_enabled = true; - else if ((mode == PREEMPT_MODE_CHECKPOINT) || + else if ((mode == PREEMPT_MODE_CANCEL) || + (mode == PREEMPT_MODE_CHECKPOINT) || (mode == PREEMPT_MODE_REQUEUE)) { job_preemption_enabled = true; job_preemption_killing = true; diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c index b1fc03959c6..d85f47a2cbe 100644 --- 
a/src/slurmctld/gang.c +++ b/src/slurmctld/gang.c @@ -672,6 +672,16 @@ static void _resume_job(uint32_t job_id) } } +static int _cancel_job(uint32_t job_id) +{ + int rc; + + rc = job_signal(job_id, SIGKILL, 0, 0); + if (rc == SLURM_SUCCESS) + info("gang: preempted job %u has been killed", job_id); + + return rc; +} static int _checkpoint_job(uint32_t job_id) { int rc; @@ -731,6 +741,8 @@ static void _preempt_job_dequeue(void) (void) _suspend_job(job_id); else if (preempt_mode == PREEMPT_MODE_REQUEUE) rc = _requeue_job(job_id); + else if (preempt_mode == PREEMPT_MODE_CANCEL) + rc = _cancel_job(job_id); else if (preempt_mode == PREEMPT_MODE_CHECKPOINT) rc = _checkpoint_job(job_id); else @@ -739,7 +751,7 @@ static void _preempt_job_dequeue(void) if (rc != SLURM_SUCCESS) { rc = job_signal(job_id, SIGKILL, 0, 0); if (rc == SLURM_SUCCESS) - info("gang: preempted job %u had to be killed", + info("gang: preempted job %u had to be killed", job_id); else { info("gang: preempted job %u kill failure %s", -- GitLab