From cb7d8e30f3dae68d1355aa582bde654497fa076b Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 11 Aug 2009 19:18:39 +0000 Subject: [PATCH] Add "GANG" flag to PreemptMode in order to control whether or not gang scheduling is performed. --- NEWS | 5 ++- doc/html/gang_scheduling.shtml | 19 ++++---- doc/man/man5/slurm.conf.5 | 29 ++++++------ slurm/slurm.h.in | 1 + src/api/config_info.c | 43 +++++++++++++----- src/common/read_config.c | 44 +++++++++++++------ src/plugins/select/cons_res/select_cons_res.c | 1 + src/plugins/select/linear/select_linear.c | 1 + src/slurmctld/gang.c | 1 + src/slurmctld/read_config.c | 16 +++---- src/slurmctld/sched_plugin.c | 18 ++++---- 11 files changed, 111 insertions(+), 67 deletions(-) diff --git a/NEWS b/NEWS index 82ad136bcb5..2a7adbe1e2a 100644 --- a/NEWS +++ b/NEWS @@ -6,8 +6,9 @@ documents those changes that are of interest to users and admins. -- Removed sched/gang plugin and moved the logic directly into the slurmctld daemon so that job preemption and gang scheduling can be used with the sched/backfill plugin. Added configuration parameter: - PreemptMode=off|suspend|cancel|requeue|checkpoint - to enable/disable the job preemption logic (disabled by default). + PreemptMode=gang|off|suspend|cancel|requeue|checkpoint + to enable/disable gang scheduling and job preemption logic (both are + disabled by default). (NOTE: There are some problems with memory management which could prevent a job from starting when memory would be freed by a job being requeued or otherwise removed, these are being worked on) diff --git a/doc/html/gang_scheduling.shtml b/doc/html/gang_scheduling.shtml index 382a977d548..4722a80e2a4 100644 --- a/doc/html/gang_scheduling.shtml +++ b/doc/html/gang_scheduling.shtml @@ -79,8 +79,9 @@ memory limits, it will be canceled in order to prevent it from adversely effecting other jobs sharing the same resources. 
</LI> <LI> -<B>PreemptMode</B>: Configure to <I>SUSPEND</I> or <I>KILL</I> (any value -other than <I>OFF</I> enables gang scheduling). +<B>PreemptMode</B>: set the <I>GANG</I> option. +Additional options may be specified to enable job preemption in addition +to gang scheduling. </LI> <LI> <B>SchedulerTimeSlice</B>: The default timeslice interval is 30 seconds. @@ -163,16 +164,16 @@ PARTITION AVAIL TIMELIMIT NODES STATE NODELIST active* up infinite 5 idle n[12-16] </PRE> <P> -Here are the Scheduler settings (the last two settings are the relevant ones): +Here are the Scheduler settings (excerpt of output): </P> <PRE> -[user@n16 load]$ <B>scontrol show config | grep Sched</B> -FastSchedule = 1 -PreemptMode = SUSPEND -SchedulerPort = 7321 -SchedulerRootFilter = 1 +[user@n16 load]$ <B>scontrol show config</B> +... +PreemptMode = GANG +... SchedulerTimeSlice = 30 SchedulerType = sched/builtin +... </PRE> <P> The <I>myload</I> script launches a simple load-generating app that runs @@ -520,6 +521,6 @@ For now this idea could be experimented with by disabling memory support in the selector and submitting appropriately sized jobs. </P> -<p style="text-align:center;">Last modified 9 July 2009</p> +<p style="text-align:center;">Last modified 11 August 2009</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 9c4284101ed..17c0883a5f6 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "July 2009" "slurm.conf 2.1" "Slurm configuration file" +.TH "slurm.conf" "5" "August 2009" "slurm.conf 2.1" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file @@ -733,21 +733,19 @@ on SPANK plugins, see the \fBspank\fR(8) manual. .TP \fBPreemptMode\fR -Controls the mechanism used to preempt lower priority jobs in order to -execute higher priority jobs. 
-For gang scheduling of jobs having the same priority, job suspend will -always be used and \fBPreemptMode\fR must be set to \fICANCEL\fR, -\fIREQUEUE\fR or \fISUSPEND\fR (anything other than \fIOFF\fR). -Two different mechanisms can be used to determine when preeption can -take place. +Enables gang scheduling and/or controls the mechanism used to preempt lower +priority jobs in order to execute higher priority jobs. +For gang scheduling (time slicing) of jobs in the same partition, +job suspend will always be used. SLURM partitions can have different \fBPriority\fR values and contain the same nodes, in which case jobs from the higher priority -partition will preempt jobs from the lower priority partition. -Alternately when SLURM is used with an accounting database, the jobs can be -submitted to the same partition but have different Quality Of Service (QOS) -values. -Jobs from a QOS with a descriptor containing \fIPREEMPTOR\fR are capable -of preempting jobs from a QOS with a descriptor containing \fIPREEMPTEE\fR. +partition can preempt jobs from the lower priority partition. +The \fBGANG\fR option can be specified in addition to a preemption +method specification with the two options comma separated. +NOTE: \fBGANG\fR must be specified for job preemption to occur. +Changes are underway to permit job preemption without gang scheduling +and job preemption based upon QOS (Quality Of Service) rather than +partition priority. .RS .TP 12 \fBOFF\fR @@ -762,6 +760,9 @@ always cancel the job. \fBCHECKPOINT\fR preempts jobs by checkpointing them (if possible) or cancelling them. .TP +\fBGANG\fR +enables gang scheduling (time slicing) of jobs in the same partition. +.TP \fBREQUEUE\fR preempts jobs by requeuing them (if possible) or cancelling them. 
.TP diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index f09bafc9356..d5f89258846 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1209,6 +1209,7 @@ typedef struct reservation_name_msg { #define PREEMPT_MODE_CHECKPOINT 0x0003 /* checkpoint job to preempt, * no automatic restart */ #define PREEMPT_MODE_CANCEL 0x0004 /* always cancel the job */ +#define PREEMPT_MODE_GANG 0x8000 /* enable gang scheduling */ typedef struct slurm_ctl_conf { time_t last_update; /* last update time of the build parameters */ diff --git a/src/api/config_info.c b/src/api/config_info.c index 1da7e62df9f..425a4f8f2a6 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -116,6 +116,37 @@ _reset_period_str(uint16_t reset_period) } } +static char * +_preempt_str(uint16_t preempt_mode) +{ + char *gang_str; + static char preempt_str[64]; + + if (preempt_mode == PREEMPT_MODE_OFF) + return "OFF"; + if (preempt_mode == PREEMPT_MODE_GANG) + return "GANG"; + + if (preempt_mode & PREEMPT_MODE_GANG) { + gang_str = "GANG,"; + preempt_mode &= (~PREEMPT_MODE_GANG); + } else + gang_str = ""; + + if (preempt_mode == PREEMPT_MODE_CANCEL) + sprintf(preempt_str, "%sCANCEL", gang_str); + else if (preempt_mode == PREEMPT_MODE_CHECKPOINT) + sprintf(preempt_str, "%sCHECKPOINT", gang_str); + else if (preempt_mode == PREEMPT_MODE_REQUEUE) + sprintf(preempt_str, "%sREQUEUE", gang_str); + else if (preempt_mode == PREEMPT_MODE_SUSPEND) + sprintf(preempt_str, "%sSUSPEND", gang_str); + else + sprintf(preempt_str, "%sUNKNOWN", gang_str); + + return preempt_str; +} + /* * slurm_print_ctl_conf - output the contents of slurm control configuration * message as loaded using slurm_load_ctl_conf @@ -299,16 +330,8 @@ void slurm_print_ctl_conf ( FILE* out, fprintf(out, "PlugStackConfig = %s\n", slurm_ctl_conf_ptr->plugstack); - if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_OFF) - fprintf(out, "PreemptMode = OFF\n"); - else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_CANCEL) - fprintf(out, 
"PreemptMode = CANCEL\n"); - else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_CHECKPOINT) - fprintf(out, "PreemptMode = CHECKPOINT\n"); - else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_REQUEUE) - fprintf(out, "PreemptMode = REQUEUE\n"); - else if (slurm_ctl_conf_ptr->preempt_mode == PREEMPT_MODE_SUSPEND) - fprintf(out, "PreemptMode = SUSPEND\n"); + fprintf(out, "PreemptMode = %s\n", + _preempt_str(slurm_ctl_conf_ptr->preempt_mode)); if (strcmp(slurm_ctl_conf_ptr->priority_type, "priority/basic") == 0) { fprintf(out, "PriorityType = %s\n", diff --git a/src/common/read_config.c b/src/common/read_config.c index f56180da69d..0b2577fa716 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -2119,20 +2119,36 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) conf->plugstack = xstrdup(default_plugstack); if (s_p_get_string(&temp_str, "PreemptMode", hashtbl)) { - if (strcasecmp(temp_str, "off") == 0) - conf->preempt_mode = PREEMPT_MODE_OFF; - else if (strcasecmp(temp_str, "cancel") == 0) - conf->preempt_mode = PREEMPT_MODE_CANCEL; - else if (strcasecmp(temp_str, "checkpoint") == 0) - conf->preempt_mode = PREEMPT_MODE_CHECKPOINT; - else if (strcasecmp(temp_str, "requeue") == 0) - conf->preempt_mode = PREEMPT_MODE_REQUEUE; - else if ((strcasecmp(temp_str, "on") == 0) || - (strcasecmp(temp_str, "suspend") == 0)) - conf->preempt_mode = PREEMPT_MODE_SUSPEND; - - else - fatal("Invalid PreemptMode: %s", temp_str); + int preempt_modes = 0; + char *last = NULL, *tok; + conf->preempt_mode = 0; + tok = strtok_r(temp_str, ",", &last); + while (tok) { + if (strcasecmp(tok, "gang") == 0) { + conf->preempt_mode |= PREEMPT_MODE_GANG; + } else if (strcasecmp(tok, "off") == 0) { + conf->preempt_mode += PREEMPT_MODE_OFF; + preempt_modes++; + } else if (strcasecmp(tok, "cancel") == 0) { + conf->preempt_mode += PREEMPT_MODE_CANCEL; + preempt_modes++; + } else if (strcasecmp(tok, "checkpoint") == 0) { + conf->preempt_mode += 
PREEMPT_MODE_CHECKPOINT; + preempt_modes++; + } else if (strcasecmp(tok, "requeue") == 0) { + conf->preempt_mode += PREEMPT_MODE_REQUEUE; + preempt_modes++; + } else if ((strcasecmp(tok, "on") == 0) || + (strcasecmp(tok, "suspend") == 0)) { + conf->preempt_mode += PREEMPT_MODE_SUSPEND; + preempt_modes++; + } else + fatal("Invalid PreemptMode: %s", tok); + tok = strtok_r(NULL, ",", &last); + } + xfree(temp_str); + if (preempt_modes > 1) + fatal("More than one PreemptMode specified"); } if (s_p_get_string(&temp_str, "PriorityDecayHalfLife", hashtbl)) { diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index f955843e47c..798294e5fb5 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -253,6 +253,7 @@ extern bool cr_preemption_enabled(void) { if (!job_preemption_tested) { uint16_t mode = slurm_get_preempt_mode(); + mode &= ~PREEMPT_MODE_GANG; if (mode == PREEMPT_MODE_SUSPEND) job_preemption_enabled = true; else if ((mode == PREEMPT_MODE_CANCEL) || diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 83ab35d3714..bef6c6006a5 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -277,6 +277,7 @@ static inline bool _job_preemption_enabled(void) { if (!job_preemption_tested) { uint16_t mode = slurm_get_preempt_mode(); + mode &= (~PREEMPT_MODE_GANG); if (mode == PREEMPT_MODE_SUSPEND) job_preemption_enabled = true; else if ((mode == PREEMPT_MODE_CANCEL) || diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c index d85f47a2cbe..58c3e411cb4 100644 --- a/src/slurmctld/gang.c +++ b/src/slurmctld/gang.c @@ -733,6 +733,7 @@ static void _preempt_job_dequeue(void) uint32_t job_id, *tmp_id; uint16_t preempt_mode = slurm_get_preempt_mode(); + preempt_mode &= (~PREEMPT_MODE_GANG); while ((tmp_id = list_pop(preempt_job_list))) { job_id = *tmp_id; xfree(tmp_id); 
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 29d53abf2e3..2cb3bc7f888 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -1042,28 +1042,26 @@ static int _preserve_select_type_param(slurm_ctl_conf_t *ctl_conf_ptr, } /* Start or stop the gang scheduler module as needed based upon changes in - * job preemption support */ + * the PREEMPT_MODE_GANG flag of the configured preempt mode */ static int _update_preempt(uint16_t old_preempt_mode) { uint16_t new_preempt_mode = slurm_get_preempt_mode(); - if ((old_preempt_mode == PREEMPT_MODE_OFF) == - (new_preempt_mode == PREEMPT_MODE_OFF)) + if ((old_preempt_mode & PREEMPT_MODE_GANG) == + (new_preempt_mode & PREEMPT_MODE_GANG)) return SLURM_SUCCESS; - if (old_preempt_mode == PREEMPT_MODE_OFF) { - info("Enabling job preemption and gang scheduling"); + if (new_preempt_mode & PREEMPT_MODE_GANG) { + info("Enabling gang scheduling"); return gs_init(); } - if (new_preempt_mode == PREEMPT_MODE_OFF) { - info("Disabling job preemption and gang scheduling"); + if (old_preempt_mode & PREEMPT_MODE_GANG) { + info("Disabling gang scheduling"); gs_wake_jobs(); return gs_fini(); } - error("Invalid value for EnablePreemption (old:%u new:%u)", - old_preempt_mode, new_preempt_mode); return EINVAL; } diff --git a/src/slurmctld/sched_plugin.c b/src/slurmctld/sched_plugin.c index 56df2f74a72..e924a8adc51 100644 --- a/src/slurmctld/sched_plugin.c +++ b/src/slurmctld/sched_plugin.c @@ -17,7 +17,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two.
You must obey the GNU * General Public License in all respects for all of the code used other than @@ -240,7 +240,7 @@ slurm_sched_init( void ) goto done; } - if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && + if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_init() != SLURM_SUCCESS)) error( "cannot start gang scheduler "); @@ -264,7 +264,7 @@ slurm_sched_fini( void ) rc = slurm_sched_context_destroy(g_sched_context); g_sched_context = NULL; - if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && + if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_fini() != SLURM_SUCCESS)) error( "cannot stop gang scheduler" ); @@ -281,7 +281,7 @@ slurm_sched_reconfig( void ) if ( slurm_sched_init() < 0 ) return SLURM_ERROR; - if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && + if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_reconfig() != SLURM_SUCCESS)) error( "cannot reconfigure gang scheduler" ); @@ -299,7 +299,7 @@ slurm_sched_schedule( void ) #if 0 /* synchronize job listings? Here? 
*/ - if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && + if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_job_scan() != SLURM_SUCCESS)) error( "gang scheduler could not rescan jobs" ); #endif @@ -316,7 +316,7 @@ slurm_sched_newalloc( struct job_record *job_ptr ) if ( slurm_sched_init() < 0 ) return SLURM_ERROR; - if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && + if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_job_start( job_ptr ) != SLURM_SUCCESS)) { error( "gang scheduler problem starting job %u", job_ptr->job_id); @@ -334,7 +334,7 @@ slurm_sched_freealloc( struct job_record *job_ptr ) if ( slurm_sched_init() < 0 ) return SLURM_ERROR; - if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && + if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_job_fini( job_ptr ) != SLURM_SUCCESS)) { error( "gang scheduler problem finishing job %u", job_ptr->job_id); @@ -367,7 +367,7 @@ slurm_sched_job_is_pending( void ) if ( slurm_sched_init() < 0 ) return; - if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && + if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_reconfig() != SLURM_SUCCESS)) error( "cannot reconfigure gang scheduler" ); @@ -385,7 +385,7 @@ slurm_sched_partition_change( void ) #if 0 /* synchronize job listings? Here? */ - if ( (slurm_get_preempt_mode() != PREEMPT_MODE_OFF) && + if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_job_scan() != SLURM_SUCCESS)) error( "gang scheduler could not rescan jobs" ); #endif -- GitLab