diff --git a/NEWS b/NEWS index 023077e46849491a962fb5b882f683bb47baa7ad..6d2543b6dc2395f8bd5ad9db4bd00e7597537a1b 100644 --- a/NEWS +++ b/NEWS @@ -23,6 +23,7 @@ documents those changes that are of interest to users and admins. for a partition, this plugin will gang-schedule or "timeslice" jobs that share common resources within the partition. Note that resources that are shared across partitions are not gang-scheduled. + -- Add EpilogMsgTime configuration parameter. See "man slurm.conf" for details. * Changes in SLURM 1.3.0-pre9 ============================= diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 572f160303d4d962b0777273b0cdc1953bb99989..47c29e3d7d11a219bd9854b09b25c28c8cb22e64 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,6 +75,11 @@ CONFIGURATION FILE CHANGES upon two factors. First the priority of its partition and the job's priority. Since nodes can be configured in multiple partitions, this can be used to configure high priority partitions (queues). +* Added new parameters "HealthCheckInterval" and "HealthCheckProgram" to automatically + test the health of compute nodes. +* Added "EpilogMsgTime" to prevent a burst of EpilogComplete messages from being + sent at the same time from many slurmd daemons to slurmctld. This should help + prevent lost messages and improve throughput for large jobs. * The partition parameter "Shared" now has a job count. For example: Shared=YES:4 (Up to 4 jobs may share each resource) Shared=FORCE:2 (Up to 2 jobs must share each resource) @@ -82,6 +87,7 @@ CONFIGURATION FILE CHANGES hours:minutes:seconds, days-hours, days-hours:minutes, days-hours:minutes:seconds or "UNLIMITED". * Checkpoint plugin added for XLCH. +* Added sched/gang plugin for time-slicing of parallel jobs. * See "man slurm.conf" for more information. OTHER CHANGES diff --git a/doc/html/configurator.html.in b/doc/html/configurator.html.in index 143cd82780d37da8165f755734df219940d064f2..fcb42d95841ea62260d91a9d8f73008236f244d1 100644 --- a/doc/html/configurator.html.in +++ b/doc/html/configurator.html.in @@ -183,6 +183,7 @@ function displayfile() "InactiveLimit=" + document.config.inactive_limit.value + "<br>" + "MinJobAge=" + document.config.min_job_age.value + "<br>" + "KillWait=" + document.config.kill_wait.value + "<br>" + + "#EpilogMsgTime=2000 <br>" + "#GetEnvTimeout=2 <br>" + "#HealthCheckInterval=0 <br>" + "#HealthCheckProgram= <br>" + diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 5e7da434d253727b7283d2612f40f8e60985152a..bf7a5de154202b6e50c26de4d3d13df526474f16 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -130,6 +130,17 @@ Fully qualified pathname of a script to execute as user root on every node when a user's job completes (e.g. "/usr/local/slurm/epilog"). This may be used to purge files, disable user login, etc. By default there is no epilog. +.TP +\fBEpilogMsgTime\fR +The number of microseconds the slurmctld daemon requires to process +an epilog completion message from the slurmd daemons. This parameter can +be used to prevent a burst of epilog completion messages from being sent +at the same time, which should help prevent lost messages and improve +throughput for large jobs. +The default value is 2000 microseconds. +For a 1000 node job, this spreads the epilog completion messages out over +two seconds. + .TP \fBFastSchedule\fR Controls how a nodes configuration specifications in slurm.conf are used. 
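The man page's worked example is easy to check: at the default of 2000 microseconds per message, 1000 epilog completion messages staggered one EpilogMsgTime apart span roughly two seconds. The standalone C sketch below only reproduces that arithmetic; node_count and the stagger-by-index behavior are illustrative assumptions, not the slurmd code (which appears later in this diff).

/* Standalone sketch of the arithmetic in the EpilogMsgTime description
 * above; node_count and the stagger-by-index behavior are illustrative
 * assumptions, not the slurmd code. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t epilog_msg_time = 2000;  /* usec per message (the default) */
        uint32_t node_count = 1000;       /* nodes whose epilogs finish together */

        /* If node i delays its EpilogComplete message by i * EpilogMsgTime,
         * the last message lags the first by (node_count - 1) * EpilogMsgTime. */
        uint64_t spread_usec = (uint64_t)(node_count - 1) * epilog_msg_time;
        printf("1000-node job: messages spread over ~%.2f seconds\n",
               spread_usec / 1000000.0);
        return 0;
}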
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 7ea3569bd6368514977684725d22f5278f912502..491c8e9e1322fff4d8068a3bd30ac3f18270a6b9 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -934,10 +934,12 @@ typedef struct slurm_ctl_conf { char *crypto_type; /* cryptographic signature plugin */ uint32_t def_mem_per_task; /* default MB memory per spawned task */ char *epilog; /* pathname of job epilog */ + uint32_t epilog_msg_time; /* usecs for slurmctld to process an + * epilog complete message */ + uint16_t fast_schedule; /* 1 to *not* check configurations by node + * (only check configuration file, faster) */ uint32_t first_job_id; /* first slurm generated job_id to assign */ uint32_t next_job_id; /* next slurm generated job_id to assign */ - uint16_t fast_schedule; /* 1 to *not* check configurations by node - * (only check configuration file, faster) */ uint16_t get_env_timeout; /* timeout for srun --get-user-env option */ uint16_t health_check_interval; /* secs between health checks */ char * health_check_program; /* pathname of health check program */ diff --git a/src/api/config_info.c b/src/api/config_info.c index 5e92cc051b686d2c274903f122b9ddab9f70686d..85e5afe8929ebb859c0a14ffa7931de8694fdf8e 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -144,6 +144,8 @@ void slurm_print_ctl_conf ( FILE* out, fprintf(out, "DefMemPerTask = UNLIMITED\n"); fprintf(out, "Epilog = %s\n", slurm_ctl_conf_ptr->epilog); + fprintf(out, "EpilogMsgTime = %u\n", + slurm_ctl_conf_ptr->epilog_msg_time); fprintf(out, "FastSchedule = %u\n", slurm_ctl_conf_ptr->fast_schedule); fprintf(out, "FirstJobId = %u\n", diff --git a/src/common/read_config.c b/src/common/read_config.c index f9fb7481474b9731e922da1bf5695e0ed9d0ab01..690b365afb6be0b13d7eefb72eb0ea15c77d1c10 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -138,6 +138,7 @@ s_p_options_t slurm_conf_options[] = { {"CryptoType", S_P_STRING}, {"DefMemPerTask", S_P_UINT32}, {"Epilog", S_P_STRING}, + {"EpilogMsgTime", S_P_UINT32}, {"FastSchedule", S_P_UINT16}, {"FirstJobId", S_P_UINT32}, {"HashBase", S_P_LONG, defunct_option}, @@ -1144,6 +1145,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->crypto_type); ctl_conf_ptr->def_mem_per_task = 0; xfree (ctl_conf_ptr->epilog); + ctl_conf_ptr->epilog_msg_time = (uint32_t) NO_VAL; ctl_conf_ptr->fast_schedule = (uint16_t) NO_VAL; ctl_conf_ptr->first_job_id = (uint32_t) NO_VAL; ctl_conf_ptr->health_check_interval = 0; @@ -1516,6 +1518,9 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) s_p_get_string(&conf->epilog, "Epilog", hashtbl); + if (!s_p_get_uint32(&conf->epilog_msg_time, "EpilogMsgTime", hashtbl)) + conf->epilog_msg_time = DEFAULT_EPILOG_MSG_TIME; + if (!s_p_get_uint16(&conf->fast_schedule, "FastSchedule", hashtbl)) conf->fast_schedule = DEFAULT_FAST_SCHEDULE; diff --git a/src/common/read_config.h b/src/common/read_config.h index 768d98c5ef3d7c9fc80cf8a44e100205760111ff..34eea1322d5c5a9d8fb25a8bce6a59523562702a 100644 --- a/src/common/read_config.h +++ b/src/common/read_config.h @@ -51,6 +51,7 @@ extern char *default_plugstack; #define DEFAULT_AUTH_TYPE "auth/none" #define DEFAULT_CACHE_GROUPS 0 #define DEFAULT_CRYPTO_TYPE "crypto/openssl" +#define DEFAULT_EPILOG_MSG_TIME 2000 #define DEFAULT_FAST_SCHEDULE 1 #define DEFAULT_FIRST_JOB_ID 1 #define DEFAULT_GET_ENV_TIMEOUT 2 diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 
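The read_config.c hunk follows the usual pattern for a new numeric option: the field starts out as a "not yet parsed" sentinel in init_slurm_conf(), and validate_and_set_defaults() falls back to DEFAULT_EPILOG_MSG_TIME when the key is absent from slurm.conf. The sketch below illustrates that pattern in isolation; lookup_uint32() is a hypothetical stand-in for the real s_p_get_uint32()/hashtbl machinery, and the sentinel value shown is only illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NO_VAL                  0xfffffffe  /* stand-in "unset" sentinel; value illustrative */
#define DEFAULT_EPILOG_MSG_TIME 2000

/* Hypothetical parser: sets *val and returns true only if "key=number"
 * is present in conf_line. */
static bool lookup_uint32(uint32_t *val, const char *key, const char *conf_line)
{
        size_t len = strlen(key);

        if (strncmp(conf_line, key, len) || conf_line[len] != '=')
                return false;
        *val = (uint32_t) strtoul(conf_line + len + 1, NULL, 10);
        return true;
}

int main(void)
{
        uint32_t epilog_msg_time = NO_VAL;      /* init_slurm_conf() stage */

        /* validate_and_set_defaults() stage: missing key -> default */
        if (!lookup_uint32(&epilog_msg_time, "EpilogMsgTime", "KillWait=30"))
                epilog_msg_time = DEFAULT_EPILOG_MSG_TIME;

        printf("EpilogMsgTime = %u\n", epilog_msg_time);        /* prints 2000 */
        return 0;
}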
6528f4d3eb901dec99f6fcdeca7c3dc6949bfa40..67630952fd0080cde93e29afaa67e5575e6cddde 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -211,6 +211,20 @@ uint32_t slurm_get_max_mem_per_task(void) return mem_per_task; } +/* slurm_get_epilog_msg_time + * RET EpilogMsgTime value from slurm.conf + */ +uint32_t slurm_get_epilog_msg_time(void) +{ + uint32_t epilog_msg_time; + slurm_ctl_conf_t *conf; + + conf = slurm_conf_lock(); + epilog_msg_time = conf->epilog_msg_time; + slurm_conf_unlock(); + return epilog_msg_time; +} + /* slurm_get_env_timeout * return default timeout for srun/sbatch --get-user-env option */ diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index fc86cc31f646afe6a7817d9c6fee6f97a5e9bb00..0746e183e7405696227aa42761b6330fdc68b139 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -107,6 +107,11 @@ uint32_t slurm_get_def_mem_per_task(void); */ uint32_t slurm_get_max_mem_per_task(void); +/* slurm_get_epilog_msg_time + * RET EpilogMsgTime value from slurm.conf + */ +uint32_t slurm_get_epilog_msg_time(void); + /* slurm_get_env_timeout * return default timeout for srun/sbatch --get-user-env option */ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 739d87445a2709039415ecb4b65169bc1ac368b2..83ce2b343a7e1bb3a17e093709b10149fb59519b 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2212,6 +2212,7 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) pack32(build_ptr->def_mem_per_task, buffer); packstr(build_ptr->epilog, buffer); + pack32(build_ptr->epilog_msg_time, buffer); pack16(build_ptr->fast_schedule, buffer); pack32(build_ptr->first_job_id, buffer); @@ -2357,6 +2358,7 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** safe_unpack32(&build_ptr->def_mem_per_task, buffer); safe_unpackstr_xmalloc(&build_ptr->epilog, &uint32_tmp, buffer); + safe_unpack32(&build_ptr->epilog_msg_time, buffer); safe_unpack16(&build_ptr->fast_schedule, buffer); safe_unpack32(&build_ptr->first_job_id, buffer); diff --git a/src/plugins/sched/gang/gang.c b/src/plugins/sched/gang/gang.c index b9b102dc7d51710b21519b99c57d86b536b9fcc5..adeadc71cc0caecabb647ce35d13050031c3828b 100644 --- a/src/plugins/sched/gang/gang.c +++ b/src/plugins/sched/gang/gang.c @@ -222,20 +222,15 @@ _load_phys_res_cnt() { int i, array_size = GS_CPU_ARRAY_INCREMENT; uint32_t adder; - if (gs_cpus_per_res) - xfree(gs_cpus_per_res); - gs_cpus_per_res = NULL; - if (gs_cpu_count_reps) - xfree(gs_cpu_count_reps); - gs_cpu_count_reps = NULL; + + xfree(gs_cpus_per_res); + xfree(gs_cpu_count_reps); gs_num_groups = 0; if (gr_type == GS_NODE || gr_type == GS_SOCKET) return; gs_cpus_per_res = xmalloc(array_size * sizeof(uint16_t)); gs_cpu_count_reps = xmalloc(array_size * sizeof(uint32_t)); - gs_cpus_per_res[0] = 0; - gs_cpu_count_reps[0] = 0; for (i = 0; i < node_record_count; i++) { uint16_t res = _compute_resources(i, 0); if (gs_cpus_per_res[gs_num_groups] == res) { @@ -301,31 +296,17 @@ _destroy_parts() { xfree(tmp->part_name); for (i = 0; i < tmp->num_jobs; i++) { j_ptr = tmp->job_list[i]; - if (j_ptr->bitmap) { + if (j_ptr->bitmap) bit_free(j_ptr->bitmap); - j_ptr->bitmap = NULL; - } - if (j_ptr->alloc_cpus) { - xfree(j_ptr->alloc_cpus); - j_ptr->alloc_cpus = NULL; - } + xfree(j_ptr->alloc_cpus); xfree(j_ptr); } - if (tmp->job_list) { - xfree(tmp->job_list); - tmp->job_list = NULL; - } - if (tmp->active_bitmap) { + if 
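Because the controller's configuration message is a flat sequence of packed fields, _pack_slurm_ctl_conf_msg() and _unpack_slurm_ctl_conf_msg() must add epilog_msg_time at the same position (here, right after the epilog string). The toy program below shows why the order matters; these pack32()/unpack32() helpers are simplified stand-ins operating on a raw byte array, not SLURM's Buf API.

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Simplified stand-ins for pack32()/unpack32(); the real code works on
 * SLURM's Buf type, but the ordering rule is the same. */
static void pack32(uint32_t val, unsigned char **p)
{
        uint32_t net = htonl(val);

        memcpy(*p, &net, sizeof(net));
        *p += sizeof(net);
}

static uint32_t unpack32(unsigned char **p)
{
        uint32_t net;

        memcpy(&net, *p, sizeof(net));
        *p += sizeof(net);
        return ntohl(net);
}

int main(void)
{
        unsigned char buf[16], *wp = buf, *rp = buf;

        pack32(2000, &wp);      /* epilog_msg_time, packed after the epilog string */
        pack32(1, &wp);         /* the next field in the message */

        assert(unpack32(&rp) == 2000);  /* must be unpacked in the same order */
        assert(unpack32(&rp) == 1);
        return 0;
}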
(tmp->active_bitmap) bit_free(tmp->active_bitmap); - tmp->active_bitmap = NULL; - } - if (tmp->active_cpus) { - xfree(tmp->active_cpus); - tmp->active_cpus = NULL; - } + xfree(tmp->active_cpus); + xfree(tmp->job_list); } xfree(gs_part_list); - gs_part_list = NULL; } /* just build the gs_part_list. The job_list will be created */ @@ -347,17 +328,10 @@ _build_parts() { fatal ("memory allocation failure"); gs_part_list = xmalloc(num_parts * sizeof(struct gs_part)); - i = 0; while ((p_ptr = (struct part_record *) list_next(part_iterator))) { gs_part_list[i].part_name = xstrdup(p_ptr->name); - gs_part_list[i].num_jobs = 0; - gs_part_list[i].job_list = NULL; - gs_part_list[i].job_list_size = 0; - gs_part_list[i].jobs_running = 0; - gs_part_list[i].active_bitmap = NULL; - gs_part_list[i].array_size = 0; - gs_part_list[i].active_cpus = NULL; + /* everything else is already set to zero/NULL */ gs_part_list[i].next = &(gs_part_list[i+1]); i++; } @@ -563,8 +537,8 @@ static void _load_alloc_cpus(struct gs_job *j_ptr, bitstr_t *nodemap) { int i, a, alloc_index, sz; - if (j_ptr->alloc_cpus) - xfree(j_ptr->alloc_cpus); + + xfree(j_ptr->alloc_cpus); sz = bit_set_count(j_ptr->bitmap); j_ptr->alloc_cpus = xmalloc(sz * sizeof(uint16_t)); @@ -648,8 +622,7 @@ _add_job_to_part(struct gs_part *p_ptr, uint32_t job_id, bitstr_t *job_bitmap) p_ptr->job_list_size = default_job_list_size; p_ptr->job_list = xmalloc(p_ptr->job_list_size * sizeof(struct gs_job *)); - for (i = 0; i < p_ptr->job_list_size; i++) - p_ptr->job_list[i] = NULL; + /* job_list is initialized to be NULL filled */ } /* protect against duplicates */ for (i = 0; i < p_ptr->num_jobs; i++) { @@ -745,10 +718,7 @@ _remove_job_from_part(uint32_t job_id, struct gs_part *p_ptr) _signal_job(j_ptr->job_id, GS_RESUME); } bit_free(j_ptr->bitmap); - j_ptr->bitmap = NULL; - if (j_ptr->alloc_cpus) - xfree(j_ptr->alloc_cpus); - j_ptr->alloc_cpus = NULL; + xfree(j_ptr->alloc_cpus); xfree(j_ptr); /* in order to remove this job from the active row, @@ -914,14 +884,8 @@ gs_fini() pthread_mutex_lock(&data_mutex); _destroy_parts(); - if (gs_cpus_per_res) { - xfree(gs_cpus_per_res); - gs_cpus_per_res = NULL; - } - if (gs_cpu_count_reps) { - xfree(gs_cpu_count_reps); - gs_cpu_count_reps = NULL; - } + xfree(gs_cpus_per_res); + xfree(gs_cpu_count_reps); gs_num_groups = 0; pthread_mutex_unlock(&data_mutex); debug3("sched/gang: leaving gs_fini"); diff --git a/src/plugins/select/cons_res/dist_tasks.c b/src/plugins/select/cons_res/dist_tasks.c index 2ecbb4a621c546350cd9746ac67b4ae25e80face..68c06621862034a8826ad415e3b2e82e521dccdb 100644 --- a/src/plugins/select/cons_res/dist_tasks.c +++ b/src/plugins/select/cons_res/dist_tasks.c @@ -89,7 +89,8 @@ int compute_c_b_task_dist(struct select_cr_job *job) #if (CR_DEBUG) for (i = 0; i < job->nhosts; i++) { - info("cons_res _c_b_task_dist %u host_index %d nprocs %u maxtasks %u cpus %u alloc_cpus %u", + info("cons_res _c_b_task_dist %u host_index %d nprocs %u " + "maxtasks %u cpus %u alloc_cpus %u", job->job_id, i, job->nprocs, maxtasks, job->cpus[i], job->alloc_cpus[i]); } diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 7a99c72b585a79278f1615b6fbae52d2133c73b7..e6c73d456689c3f51bd68009d88bf2cc7f41e4f8 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -329,6 +329,7 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->crypto_type = xstrdup(conf->crypto_type); conf_ptr->def_mem_per_task = conf->def_mem_per_task; conf_ptr->epilog = xstrdup(conf->epilog); + conf_ptr->epilog_msg_time = 
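The gang.c cleanup leans on two properties that the added comments also point to: the allocator returns zero-filled memory, and the free wrapper both tolerates a NULL pointer and clears the caller's variable. A rough standalone analogue, using hypothetical my_xmalloc()/my_xfree() macros rather than SLURM's xmalloc.h, shows why the removed "if (ptr)" guards and explicit "= NULL"/"= 0" assignments were redundant.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Hypothetical analogues of xmalloc()/xfree(): zero-filled allocation,
 * and a "free" that accepts NULL and clears the caller's pointer. */
#define my_xmalloc(size)  calloc(1, (size))

#define my_xfree(ptr)                   \
        do {                            \
                free(ptr);              \
                (ptr) = NULL;           \
        } while (0)

int main(void)
{
        uint16_t *cpus_per_res = NULL;

        my_xfree(cpus_per_res);                  /* safe on NULL, so no "if (ptr)" guard */
        cpus_per_res = my_xmalloc(8 * sizeof(uint16_t));
        assert(cpus_per_res[0] == 0);            /* zero-filled, so no "[0] = 0" needed */
        my_xfree(cpus_per_res);
        assert(cpus_per_res == NULL);            /* pointer cleared after the free */
        return 0;
}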
conf->epilog_msg_time; conf_ptr->fast_schedule = conf->fast_schedule; conf_ptr->first_job_id = conf->first_job_id; conf_ptr->inactive_limit = conf->inactive_limit; diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index b5c5a7daa34ffdef9902e31e608695b53168a7ac..463e976f36347ff60b7d2a61b65d8b387465df98 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -2505,6 +2505,7 @@ static void _sync_messages_kill(kill_job_msg_t *req) int host_cnt, host_inx; char *host; hostset_t hosts; + int epilog_msg_time; hosts = hostset_create(req->nodes); host_cnt = hostset_count(hosts); @@ -2523,7 +2524,8 @@ static void _sync_messages_kill(kill_job_msg_t *req) } free(host); } - _delay_rpc(host_inx, host_cnt, 10000); + epilog_msg_time = slurm_get_epilog_msg_time(); + _delay_rpc(host_inx, host_cnt, epilog_msg_time); fini: hostset_destroy(hosts); }
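_delay_rpc() itself is not part of this diff, so the sketch below is only an assumption about the kind of staggering it performs: each of the host_cnt hosts sleeps in proportion to its index before sending its epilog complete RPC, so the messages reach slurmctld about usec_per_msg microseconds apart instead of all at once. delay_rpc_sketch() is a hypothetical illustration, not the slurmd implementation.

#include <stdio.h>
#include <time.h>

/* Hypothetical stand-in for _delay_rpc(): host host_inx waits in
 * proportion to its index so that the host_cnt epilog complete RPCs
 * arrive at slurmctld about usec_per_msg microseconds apart. */
static void delay_rpc_sketch(int host_inx, int host_cnt, int usec_per_msg)
{
        long long usec = (long long) host_inx * usec_per_msg;
        struct timespec ts = {
                .tv_sec  = usec / 1000000,
                .tv_nsec = (usec % 1000000) * 1000L,
        };

        (void) host_cnt;        /* kept only to mirror the call shown above */
        nanosleep(&ts, NULL);
}

int main(void)
{
        delay_rpc_sketch(3, 1000, 2000);        /* the 4th host waits ~6 ms */
        puts("epilog complete message would be sent now");
        return 0;
}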