diff --git a/NEWS b/NEWS index 91263d13e49d9ce364dcfe122f679ce9b0eee2ec..6a70c6b987e74cfbcc8f9a87cd025e091faa705d 100644 --- a/NEWS +++ b/NEWS @@ -111,6 +111,9 @@ documents those changes that are of interest to users and admins. together across clusters or within clusters that are not related. Use the --wckey option in srun, sbatch or salloc or set the SLURM_WCKEY env var to have this set. Use sreport with the wckey option to view reports. + -- Added configuration parameter BatchStartTimeout to control how long to + allow for a batch job prolog and environment loading (for Moab) to run. + See "man slurm.conf" for details. * Changes in SLURM 1.3.11 ========================= diff --git a/doc/html/configurator.html.in b/doc/html/configurator.html.in index 6ec53b9c407183cd3a971544174396e21e4f59a0..41cdf10e56b9bcf51600988063c01fed0f5c382d 100644 --- a/doc/html/configurator.html.in +++ b/doc/html/configurator.html.in @@ -191,6 +191,7 @@ function displayfile() "# <br>" + "# <br>" + "# TIMERS <br>" + + "#BatchStartTimeout=10 <br>" + "#EpilogMsgTime=2000 <br>" + "#GetEnvTimeout=2 <br>" + "#HealthCheckInterval=0 <br>" + diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 2707e0dfe6cd457e4996fcac39542065e386866d..1f5bddd448c496886df7e604acfa8cf919ee143d 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -156,6 +156,17 @@ This should be a node name without the full domain name (e.g. "lx0002"). While not essential, it is recommended that you specify a backup controller. See the \fBRELOCATING CONTROLLERS\fR section if you change this. +.TP +\fBBatchStartTimeout\fR +The maximum time (in seconds) that a batch job is permitted for +launching before it is considered missing and its allocation is +released. The default value is 10 (seconds). Larger values may +be required if more time is needed to execute the \fBProlog\fR, +load user environment variables (for Moab spawned jobs), or if the +slurmd daemon gets paged from memory. 
+NOTE: The value will not be reported by "scontrol show config" command +until SLURM version 1.4. + .TP \fBCacheGroups\fR If set to 1, the slurmd daemon will cache /etc/groups entries. diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 3893ae9a8b59f66c265ea173f153ba606e6023f3..bd6814601b0a30274e925bb14e39285411eedbe1 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -973,6 +973,7 @@ typedef struct slurm_ctl_conf { char *authtype; /* authentication type */ char *backup_addr; /* comm path of slurmctld secondary server */ char *backup_controller;/* name of slurmctld secondary server */ + uint16_t batch_start_timeout; /* max secs for batch job to start */ time_t boot_time; /* time slurmctld last booted */ uint16_t cache_groups; /* cache /etc/groups to avoid initgroups(2) */ char *checkpoint_type; /* checkpoint plugin type */ diff --git a/src/common/read_config.c b/src/common/read_config.c index c12b5be689402bdf0b8f3b5a09ff29d6d5908d06..dfd2b10a9c1deefb8e51d0fcf9ca3cbc940bd31e 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -135,6 +135,7 @@ s_p_options_t slurm_conf_options[] = { {"AuthType", S_P_STRING}, {"BackupAddr", S_P_STRING}, {"BackupController", S_P_STRING}, + {"BatchStartTimeout", S_P_UINT16}, {"CheckpointType", S_P_STRING}, {"CacheGroups", S_P_UINT16}, {"ClusterName", S_P_STRING}, @@ -1231,6 +1232,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->authtype); xfree (ctl_conf_ptr->backup_addr); xfree (ctl_conf_ptr->backup_controller); + ctl_conf_ptr->batch_start_timeout = 0; ctl_conf_ptr->cache_groups = 0; xfree (ctl_conf_ptr->checkpoint_type); xfree (ctl_conf_ptr->cluster_name); @@ -1546,6 +1548,10 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) conf->backup_addr = xstrdup(conf->backup_controller); } + if (!s_p_get_uint16(&conf->batch_start_timeout, "BatchStartTimeout", + hashtbl)) + conf->batch_start_timeout = DEFAULT_BATCH_START_TIMEOUT; + s_p_get_string(&conf->cluster_name, 
"ClusterName", hashtbl); if (!s_p_get_uint16(&conf->complete_wait, "CompleteWait", hashtbl)) diff --git a/src/common/read_config.h b/src/common/read_config.h index cdf9b2701c9d4d47243737375e4ff876f4acdcf2..ae5e4583e3646de5daf266cd50af0226cc7d033e 100644 --- a/src/common/read_config.h +++ b/src/common/read_config.h @@ -59,6 +59,7 @@ enum { #define DEFAULT_ACCOUNTING_ENFORCE ACCOUNTING_ENFORCE_NONE #define DEFAULT_ACCOUNTING_STORAGE_TYPE "accounting_storage/none" #define DEFAULT_AUTH_TYPE "auth/munge" +#define DEFAULT_BATCH_START_TIMEOUT 10 #define DEFAULT_CACHE_GROUPS 0 #define DEFAULT_COMPLETE_WAIT 0 #define DEFAULT_CRYPTO_TYPE "crypto/munge" diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 3d524f427f7cdabaf4cb8a9c01297a1639d17fdc..5a3847c874736fb3c4e3a4056458ff9774674d24 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -114,7 +114,7 @@ int slurm_set_api_config(slurm_protocol_config_t * protocol_conf) * returns a pointer to the current slurm_protocol_config object * RET slurm_protocol_config_t - current slurm_protocol_config object */ -slurm_protocol_config_t *slurm_get_api_config() +slurm_protocol_config_t *slurm_get_api_config(void) { return proto_conf; } @@ -134,7 +134,7 @@ extern void slurm_api_set_conf_file(char *pathname) * the compiled in default slurm_protocol_config object is initialized * RET int - return code */ -int slurm_api_set_default_config() +int slurm_api_set_default_config(void) { int rc = SLURM_SUCCESS; slurm_ctl_conf_t *conf; @@ -206,6 +206,23 @@ uint16_t slurm_get_complete_wait(void) /* slurm_mutex_lock(&config_lock); */ /* } */ +/* slurm_get_batch_start_timeout + * RET BatchStartTimeout value from slurm.conf + */ +uint16_t slurm_get_batch_start_timeout(void) +{ + uint16_t batch_start_timeout = 0; + slurm_ctl_conf_t *conf; + + if(slurmdbd_conf) { + } else { + conf = slurm_conf_lock(); + batch_start_timeout = conf->batch_start_timeout; + slurm_conf_unlock(); + } + return 
batch_start_timeout; +} + /* slurm_get_def_mem_per_task * RET DefMemPerTask value from slurm.conf */ diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 5086125f26d2f51cd473a94b0502794b4ed05e3b..62274160f5483766fa82ea4d91db94b3dbb93529 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -95,7 +95,12 @@ int inline slurm_set_api_config(slurm_protocol_config_t * protocol_conf); * returns a pointer to the current slurm_protocol_config object * RET slurm_protocol_config_t - current slurm_protocol_config object */ -inline slurm_protocol_config_t *slurm_get_api_config(); +inline slurm_protocol_config_t *slurm_get_api_config(void); + +/* slurm_get_batch_start_timeout + * RET BatchStartTimeout value from slurm.conf + */ +uint16_t slurm_get_batch_start_timeout(void); /* slurm_get_complete_wait * RET CompleteWait value from slurm.conf diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 0472ed2f06a06ca90d884afd7e7d8acc79cc5fb3..7b8f28581b8e8be2dc91dffba33f77545c04ec45 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -81,7 +81,6 @@ #include "src/slurmctld/srun_comm.h" #include "src/slurmctld/trigger_mgr.h" -#define BATCH_JOB_LAUNCH_TIME 1200 /* seconds for prolog & env var load */ #define DETAILS_FLAG 0xdddd #define MAX_RETRIES 10 #define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0 @@ -4929,7 +4928,7 @@ static void _purge_lost_batch_jobs(int node_inx, time_t now) { ListIterator job_iterator; struct job_record *job_ptr; - time_t recent = now - BATCH_JOB_LAUNCH_TIME; + time_t recent = now - slurm_get_batch_start_timeout(); job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) {