diff --git a/NEWS b/NEWS index 8ceb24d8d97ce6fe5c37b8c45e6fb2aebf05cf9a..c63efe418a84542c32a00f11a6c7cfada7827f81 100644 --- a/NEWS +++ b/NEWS @@ -87,6 +87,8 @@ documents those changes that are of interest to users and admins. - In sched/wiki2, add support for EHost and EHostBackup configuration parameters in wiki.conf file - In sched/wiki2, fix memory management bug for JOBWILLRUN command. + - In sched/wiki2, consider job Busy while in Completing state for + KillWait+10 seconds (used to be 30 seconds). * Changes in SLURM 1.1.17 ========================= diff --git a/src/plugins/sched/wiki2/event.c b/src/plugins/sched/wiki2/event.c index 4e57f732b005442e1a6075a3f5ac02d24809c2a4..86b9a65bcc91274967b4ccf82cb680919454cc26 100644 --- a/src/plugins/sched/wiki2/event.c +++ b/src/plugins/sched/wiki2/event.c @@ -68,12 +68,6 @@ extern int event_notify(char *msg) pthread_mutex_lock(&event_mutex); if (event_addr_set == 0) { /* Identify address for socket connection */ - if (e_host[0] == '\0') { - slurm_ctl_conf_t *conf = slurm_conf_lock(); - strncpy(e_host, conf->control_addr, - sizeof(e_host)); - slurm_conf_unlock(); - } slurm_set_addr(&moab_event_addr, e_port, e_host); event_addr_set = 1; if (e_host_bu[0] != '\0') { diff --git a/src/plugins/sched/wiki2/get_jobs.c b/src/plugins/sched/wiki2/get_jobs.c index 748a40be525fe1deecf8925e4dd93b06e0dca56f..3178d8c9af92e84727507ba7e32018f19af1fab9 100644 --- a/src/plugins/sched/wiki2/get_jobs.c +++ b/src/plugins/sched/wiki2/get_jobs.c @@ -319,15 +319,13 @@ static char * _get_job_state(struct job_record *job_ptr) return "Running"; if (state & JOB_COMPLETING) { - /* Give 60 seconds to clear out, then - * then consider job done. Let Moab - * deal with inconsistency between - * job state (DONE) and node state - * (some IDLE and others still - * BUSY). */ + /* Give configured KillWait+10 for job + * to clear out, then consider job + * done. Moab will allocate jobs to + * nodes that are already Idle. 
*/ int age = (int) difftime(time(NULL), job_ptr->end_time); - if (age < 60) + if (age < (kill_wait+10)) return "Running"; } diff --git a/src/plugins/sched/wiki2/msg.c b/src/plugins/sched/wiki2/msg.c index e933da77e80bae2eb0371446724195ec439cf303..34d1f92dd666cc947a908044d00c22719a28206e 100644 --- a/src/plugins/sched/wiki2/msg.c +++ b/src/plugins/sched/wiki2/msg.c @@ -53,6 +53,7 @@ char e_host_bu[E_HOST_SIZE] = ""; uint16_t e_port = 0; uint16_t job_aggregation_time = 10; /* Default value is 10 seconds */ int init_prio_mode = PRIO_HOLD; +uint16_t kill_wait; uint16_t use_host_exp = 0; static char * _get_wiki_conf_path(void); @@ -205,6 +206,13 @@ static void _parse_wiki_config(void) s_p_hashtbl_t *tbl; char *key = NULL, *priority_mode = NULL, *wiki_conf; struct stat buf; + slurm_ctl_conf_t *conf; + + /* Set default values */ + conf = slurm_conf_lock(); + strncpy(e_host, conf->control_addr, sizeof(e_host)); + kill_wait = conf->kill_wait; + slurm_conf_unlock(); wiki_conf = _get_wiki_conf_path(); if ((wiki_conf == NULL) || (stat(wiki_conf, &buf) == -1)) { @@ -227,7 +235,8 @@ static void _parse_wiki_config(void) if ( s_p_get_string(&key, "EHost", tbl)) { strncpy(e_host, key, sizeof(e_host)); xfree(key); - } + } else + debug("wiki: Using ControlAddr for EHost value"); if ( s_p_get_string(&key, "EHostBackup", tbl)) { strncpy(e_host_bu, key, sizeof(e_host_bu)); xfree(key); @@ -247,6 +256,15 @@ static void _parse_wiki_config(void) s_p_hashtbl_destroy(tbl); xfree(wiki_conf); +#if 0 + info("AuthKey = %s", auth_key); + info("EHost = %s", e_host); + info("EHostBackup = %s", e_host_bu); + info("EPort = %u", e_port); + info("JobAggregationTime = %u sec", job_aggregation_time); + info("JobPriority = %s", init_prio_mode ? 
"run" : "hold"); + info("KillWait = %u sec", kill_wait); +#endif return; } diff --git a/src/plugins/sched/wiki2/msg.h b/src/plugins/sched/wiki2/msg.h index c46254385d02e93896441ec83e83de4d4eaac4e9..0c362c145d2439909182b25b588b200ccb20e503 100644 --- a/src/plugins/sched/wiki2/msg.h +++ b/src/plugins/sched/wiki2/msg.h @@ -91,6 +91,7 @@ extern char e_host[E_HOST_SIZE]; extern char e_host_bu[E_HOST_SIZE]; extern uint16_t e_port; extern uint16_t job_aggregation_time; +extern uint16_t kill_wait; extern uint16_t use_host_exp; extern int event_notify(char *msg);