From b1d827eab9b756b80963b3841ea21d9b17dd5fc0 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 4 Jul 2003 16:46:52 +0000 Subject: [PATCH] Add WaitTime, MaxJobCount and MinJobAge configuration parameters. WaitTime sets srun's default value for --wait. MaxJobCount sets the maximum job count for slurmctld (replacing #define MAX_JOB_COUNT). MinJobAge sets the minimum job purge age for slurmctld (replacing #define MIN_JOB_AGE). --- src/common/read_config.c | 30 +++++++++++++++++++++++++++++- src/common/slurm_protocol_api.c | 13 +++++++++++++ src/common/slurm_protocol_api.h | 6 ++++++ src/common/slurm_protocol_pack.c | 6 ++++++ src/slurmctld/controller.c | 3 +++ src/slurmctld/job_mgr.c | 15 +++++++++------ src/slurmctld/read_config.c | 27 ++++++++++++++++++--------- src/slurmctld/slurmctld.h | 22 ++++++++++++---------- src/srun/opt.c | 3 ++- 9 files changed, 98 insertions(+), 27 deletions(-) diff --git a/src/common/read_config.c b/src/common/read_config.c index a521223ecd0..52c5af72abd 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -112,6 +112,8 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->job_credential_private_key); xfree (ctl_conf_ptr->job_credential_public_certificate); ctl_conf_ptr->kill_wait = (uint16_t) NO_VAL; + ctl_conf_ptr->max_job_cnt = (uint16_t) NO_VAL; + ctl_conf_ptr->min_job_age = (uint16_t) NO_VAL; xfree (ctl_conf_ptr->plugindir); xfree (ctl_conf_ptr->prioritize); xfree (ctl_conf_ptr->prolog); @@ -133,6 +135,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->plugindir); xfree (ctl_conf_ptr->authtype ); xfree (ctl_conf_ptr->tmp_fs); + ctl_conf_ptr->wait_time = (uint16_t) NO_VAL; return; } @@ -159,6 +162,7 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) int inactive_limit = -1, kill_wait = -1; int ret2service = -1, slurmctld_timeout = -1, slurmd_timeout = -1; int slurmctld_debug = -1, slurmd_debug = -1; + int max_job_cnt = -1, min_job_age = 
-1, wait_time = -1; char *backup_addr = NULL, *backup_controller = NULL; char *control_addr = NULL, *control_machine = NULL, *epilog = NULL; char *prioritize = NULL, *prolog = NULL; @@ -189,6 +193,8 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) "JobCredentialPublicCertificate=", 's', &job_credential_public_certificate, "KillWait=", 'd', &kill_wait, + "MaxJobCount=", 'd', &max_job_cnt, + "MinJobAge=", 'd', &min_job_age, "PluginDir=", 's', &plugindir, "Prioritize=", 's', &prioritize, "Prolog=", 's', &prolog, @@ -207,6 +213,7 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) "SlurmdTimeout=", 'd', &slurmd_timeout, "StateSaveLocation=", 's', &state_save_location, "TmpFS=", 's', &tmp_fs, + "WaitTime=", 'd', &wait_time, "END"); if (error_code) @@ -296,6 +303,18 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) ctl_conf_ptr->kill_wait = kill_wait; } + if ( max_job_cnt != -1) { + if ( ctl_conf_ptr->max_job_cnt != (uint16_t) NO_VAL) + error (MULTIPLE_VALUE_MSG, "MaxJobCount"); + ctl_conf_ptr->max_job_cnt = max_job_cnt; + } + + if ( min_job_age != -1) { + if ( ctl_conf_ptr->min_job_age != (uint16_t) NO_VAL) + error (MULTIPLE_VALUE_MSG, "MinJobAge"); + ctl_conf_ptr->min_job_age = min_job_age; + } + if ( plugindir ) { if ( ctl_conf_ptr->plugindir ) { error( MULTIPLE_VALUE_MSG, "PluginDir" ); @@ -449,6 +468,12 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) ctl_conf_ptr->tmp_fs = tmp_fs; } + if ( wait_time != -1) { + if ( ctl_conf_ptr->wait_time != (uint16_t) NO_VAL) + error (MULTIPLE_VALUE_MSG, "WaitTime"); + ctl_conf_ptr->wait_time = wait_time; + } + if ( job_credential_private_key ) { if ( ctl_conf_ptr->job_credential_private_key ) { error (MULTIPLE_VALUE_MSG, "JobCredentialPrivateKey"); @@ -747,7 +772,10 @@ validate_config (slurm_ctl_conf_t *ctl_conf_ptr) _normalize_debug_level(&ctl_conf_ptr->slurmd_debug); if (ctl_conf_ptr->kill_wait == (uint16_t) NO_VAL) - ctl_conf_ptr->kill_wait = 30; + 
ctl_conf_ptr->kill_wait = DEFAULT_KILL_WAIT; + + if (ctl_conf_ptr->wait_time == (uint16_t) NO_VAL) + ctl_conf_ptr->wait_time = DEFAULT_WAIT_TIME; } /* Normalize supplied debug level to be in range per log.h definitions */ diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index c22585e5d38..4632ded8f70 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -105,6 +105,7 @@ int slurm_api_set_default_config() goto cleanup; read_slurm_conf_ctl(&slurmctld_conf); + if ((slurmctld_conf.control_addr == NULL) || (slurmctld_conf.slurmctld_port == 0)) { error("Unable to establish control machine or port"); @@ -157,6 +158,18 @@ uint32_t slurm_get_slurm_user_id(void) return slurmctld_conf.slurm_user_id; } +/* slurm_get_wait_time + * returns wait_time from slurmctld_conf object + * RET uint16_t - wait_time + */ +uint16_t slurm_get_wait_time(void) +{ + if (slurmctld_conf.slurmd_port == 0) /* ==0 if config unread */ + slurm_api_set_default_config(); + + return slurmctld_conf.wait_time; +} + /**********************************************************************\ * general message management functions used by slurmctld, slurmd \**********************************************************************/ diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 6406bcb845c..805eb9fd2a6 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -99,6 +99,12 @@ short int inline slurm_get_slurmd_port(void); */ uint32_t slurm_get_slurm_user_id(void); +/* slurm_get_wait_time + * returns wait_time from slurmctld_conf object + * RET uint16_t - wait_time + */ +uint16_t slurm_get_wait_time(void); + /**********************************************************************\ * general message management functions used by slurmctld, slurmd \**********************************************************************/ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c 
index cd864d77ef8..a71d220a489 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -1491,6 +1491,8 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) pack16(build_ptr->heartbeat_interval, buffer); pack16(build_ptr->inactive_limit, buffer); pack16(build_ptr->kill_wait, buffer); + pack16(build_ptr->max_job_cnt, buffer); + pack16(build_ptr->min_job_age, buffer); packstr(build_ptr->plugindir, buffer); packstr(build_ptr->prioritize, buffer); packstr(build_ptr->prolog, buffer); @@ -1511,6 +1513,7 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) packstr(build_ptr->slurm_conf, buffer); packstr(build_ptr->state_save_location, buffer); packstr(build_ptr->tmp_fs, buffer); + pack16(build_ptr->wait_time, buffer); packstr(build_ptr->job_credential_private_key, buffer); packstr(build_ptr->job_credential_public_certificate, buffer); } @@ -1543,6 +1546,8 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** safe_unpack16(&build_ptr->heartbeat_interval, buffer); safe_unpack16(&build_ptr->inactive_limit, buffer); safe_unpack16(&build_ptr->kill_wait, buffer); + safe_unpack16(&build_ptr->max_job_cnt, buffer); + safe_unpack16(&build_ptr->min_job_age, buffer); safe_unpackstr_xmalloc(&build_ptr->plugindir, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&build_ptr->prioritize, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&build_ptr->prolog, &uint16_tmp, buffer); @@ -1570,6 +1575,7 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** safe_unpackstr_xmalloc(&build_ptr->state_save_location, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&build_ptr->tmp_fs, &uint16_tmp, buffer); + safe_unpack16(&build_ptr->wait_time, buffer); safe_unpackstr_xmalloc(&build_ptr->job_credential_private_key, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&build_ptr-> diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 8b06b68e8b0..4cbe0c9946a 100644 --- a/src/slurmctld/controller.c +++ 
b/src/slurmctld/controller.c @@ -2058,6 +2058,8 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->job_credential_public_certificate = slurmctld_conf.job_credential_public_certificate; conf_ptr->kill_wait = slurmctld_conf.kill_wait; + conf_ptr->max_job_cnt = slurmctld_conf.max_job_cnt; + conf_ptr->min_job_age = slurmctld_conf.min_job_age; conf_ptr->plugindir = slurmctld_conf.plugindir; conf_ptr->prioritize = slurmctld_conf.prioritize; conf_ptr->prolog = slurmctld_conf.prolog; @@ -2078,6 +2080,7 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->slurm_conf = slurmctld_conf.slurm_conf; conf_ptr->state_save_location = slurmctld_conf.state_save_location; conf_ptr->tmp_fs = slurmctld_conf.tmp_fs; + conf_ptr->wait_time = slurmctld_conf.wait_time; return; } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index f3726236bd6..f51c4a625ba 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -69,7 +69,7 @@ #define STEP_FLAG 0xbbbb #define TOP_PRIORITY 0xffff0000 /* large, but leave headroom for higher */ -#define JOB_HASH_INX(_job_id) (_job_id % MAX_JOB_COUNT) +#define JOB_HASH_INX(_job_id) (_job_id % DEFAULT_MAX_JOB_COUNT) #define YES_OR_NO(_in_string) \ (( strcmp ((_in_string),"YES"))? 
\ @@ -83,8 +83,8 @@ time_t last_job_update; /* time of last update to job records */ static int default_prio = TOP_PRIORITY; static int job_count; /* job's in the system */ static long job_id_sequence = -1; /* first job_id to assign new job */ -static struct job_record *job_hash[MAX_JOB_COUNT]; -static struct job_record *job_hash_over[MAX_JOB_COUNT]; +static struct job_record *job_hash[DEFAULT_MAX_JOB_COUNT]; +static struct job_record *job_hash_over[DEFAULT_MAX_JOB_COUNT]; static int max_hash_over = 0; /* Local functions */ @@ -154,7 +154,7 @@ struct job_record *create_job_record(int *error_code) struct job_record *job_record_point; struct job_details *job_details_point; - if (job_count >= MAX_JOB_COUNT) { + if (job_count >= DEFAULT_MAX_JOB_COUNT) { error("create_job_record: job_count exceeds limit"); *error_code = EAGAIN; return NULL; @@ -770,7 +770,7 @@ void _add_job_hash(struct job_record *job_ptr) inx = JOB_HASH_INX(job_ptr->job_id); if (job_hash[inx]) { - if (max_hash_over >= MAX_JOB_COUNT) + if (max_hash_over >= DEFAULT_MAX_JOB_COUNT) fatal("Job hash table overflow"); job_hash_over[max_hash_over++] = job_ptr; } else @@ -2064,9 +2064,12 @@ static int _list_find_job_id(void *job_entry, void *key) */ static int _list_find_job_old(void *job_entry, void *key) { - time_t min_age = time(NULL) - MIN_JOB_AGE; + time_t min_age = time(NULL) - slurmctld_conf.min_job_age; struct job_record *job_ptr = (struct job_record *)job_entry; + if (slurmctld_conf.min_job_age == 0) + return 0; /* No job record purging */ + if (job_ptr->end_time > min_age) return 0; /* Too new to purge */ diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 3cc4b36b01c..474f4c5c992 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -813,31 +813,37 @@ static void _set_config_defaults(slurm_ctl_conf_t * ctl_conf_ptr) "read_slurm_conf: backup_controller value not specified."); if (ctl_conf_ptr->fast_schedule == (uint16_t) NO_VAL) - 
ctl_conf_ptr->fast_schedule = 1; + ctl_conf_ptr->fast_schedule = DEFAULT_FAST_SCHEDULE; if (ctl_conf_ptr->first_job_id == (uint32_t) NO_VAL) - ctl_conf_ptr->first_job_id = 1; + ctl_conf_ptr->first_job_id = DEFAULT_FIRST_JOB_ID; if (ctl_conf_ptr->hash_base == (uint16_t) NO_VAL) - ctl_conf_ptr->hash_base = 10; + ctl_conf_ptr->hash_base = DEFAULT_HASH_BASE; if (ctl_conf_ptr->heartbeat_interval == (uint16_t) NO_VAL) - ctl_conf_ptr->heartbeat_interval = 60; + ctl_conf_ptr->heartbeat_interval = DEFAULT_HEARTBEAT_INTERVAL; if (ctl_conf_ptr->inactive_limit == (uint16_t) NO_VAL) - ctl_conf_ptr->inactive_limit = 0; /* unlimited */ + ctl_conf_ptr->inactive_limit = DEFAULT_INACTIVE_LIMIT; if (ctl_conf_ptr->kill_wait == (uint16_t) NO_VAL) - ctl_conf_ptr->kill_wait = 30; + ctl_conf_ptr->kill_wait = DEFAULT_KILL_WAIT; + + if (ctl_conf_ptr->max_job_cnt == (uint16_t) NO_VAL) + ctl_conf_ptr->max_job_cnt = DEFAULT_MAX_JOB_COUNT; + + if (ctl_conf_ptr->min_job_age == (uint16_t) NO_VAL) + ctl_conf_ptr->min_job_age = DEFAULT_MIN_JOB_AGE; if (ctl_conf_ptr->ret2service == (uint16_t) NO_VAL) - ctl_conf_ptr->ret2service = 0; + ctl_conf_ptr->ret2service = DEFAULT_RETURN_TO_SERVICE; if (ctl_conf_ptr->slurmctld_timeout == (uint16_t) NO_VAL) - ctl_conf_ptr->slurmctld_timeout = 300; + ctl_conf_ptr->slurmctld_timeout = DEFAULT_SLURMCTLD_TIMEOUT; if (ctl_conf_ptr->slurmd_timeout == (uint16_t) NO_VAL) - ctl_conf_ptr->slurmd_timeout = 300; + ctl_conf_ptr->slurmd_timeout = DEFAULT_SLURMD_TIMEOUT; if (ctl_conf_ptr->state_save_location == NULL) ctl_conf_ptr->state_save_location = @@ -845,6 +851,9 @@ static void _set_config_defaults(slurm_ctl_conf_t * ctl_conf_ptr) if (ctl_conf_ptr->tmp_fs == NULL) ctl_conf_ptr->tmp_fs = xstrdup(DEFAULT_TMP_FS); + + if (ctl_conf_ptr->wait_time == (uint16_t) NO_VAL) + ctl_conf_ptr->wait_time = DEFAULT_WAIT_TIME; } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index fefe3141c82..2f527cc3ab2 100644 --- a/src/slurmctld/slurmctld.h +++ 
b/src/slurmctld/slurmctld.h @@ -96,16 +96,18 @@ * Update the group uid_t access list as needed */ #define PERIODIC_GROUP_CHECK 600 -/* Default temporary storage for slurm state and user files */ -#define DEFAULT_TMP_FS "/tmp" - -/* Don't accept more jobs once there are MAX_JOB_COUNT in the system - * This should prevent exhausting memory */ -#define MAX_JOB_COUNT 2000 - -/* Purge OK for jobs over MIN_JOB_AGE seconds old (since completion) - * This should prevent exhausting memory */ -#define MIN_JOB_AGE 300 +/* Default configuration file values */ +#define DEFAULT_FAST_SCHEDULE 1 +#define DEFAULT_FIRST_JOB_ID 1 +#define DEFAULT_HASH_BASE 10 +#define DEFAULT_HEARTBEAT_INTERVAL 60 +#define DEFAULT_INACTIVE_LIMIT 0 +#define DEFAULT_MAX_JOB_COUNT 2000 +#define DEFAULT_MIN_JOB_AGE 300 +#define DEFAULT_RETURN_TO_SERVICE 0 +#define DEFAULT_SLURMCTLD_TIMEOUT 300 +#define DEFAULT_SLURMD_TIMEOUT 300 +#define DEFAULT_TMP_FS "/tmp" extern slurm_ctl_conf_t slurmctld_conf; diff --git a/src/srun/opt.c b/src/srun/opt.c index c923b63cd35..c1e7bf11aa0 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -46,6 +46,7 @@ #include "src/common/list.h" #include "src/common/log.h" +#include "src/common/slurm_protocol_api.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" @@ -584,7 +585,7 @@ static void _opt_default() opt.allocate = false; opt.attach = NULL; opt.join = false; - opt.max_wait = 0; + opt.max_wait = slurm_get_wait_time(); _verbose = 0; opt.slurmd_debug = LOG_LEVEL_QUIET; -- GitLab