diff --git a/NEWS b/NEWS index 598122b278818f2cdd5e771e322bbc01c5d3b9fe..8ceb24d8d97ce6fe5c37b8c45e6fb2aebf05cf9a 100644 --- a/NEWS +++ b/NEWS @@ -82,6 +82,12 @@ documents those changes that are of interest to users and admins. the code) -- Added support for OSX build. +* Changes in SLURM 1.1.18 +========================= + - In sched/wiki2, add support for EHost and EHostBackup configuration + parameters in wiki.conf file. + - In sched/wiki2, fix memory management bug for JOBWILLRUN command. + * Changes in SLURM 1.1.17 ========================= - BLUEGENE - fix to make dynamic partitioning not go create block where diff --git a/doc/man/man5/wiki.conf.5 b/doc/man/man5/wiki.conf.5 index 56d49ff55103e7510f00efb74ba1b122fdbc7ad7..772ff3c0f46dbadbd9a4d2c36310875243c68df7 100644 --- a/doc/man/man5/wiki.conf.5 +++ b/doc/man/man5/wiki.conf.5 @@ -1,4 +1,4 @@ -.TH "wiki.conf" "5" "September 2006" "wiki.conf 1.1" "Slurm configuration file" +.TH "wiki.conf" "5" "October 2006" "wiki.conf 1.1" "Slurm configuration file" .SH "NAME" wiki.conf \- Slurm configuration file for wiki scheduler plugin .SH "DESCRIPTION" @@ -21,6 +21,20 @@ Authentication key for communications. This numeric value should match KEY configured in the \fBmoab\-private.cnf\fR file. +.TP +\fBEHost\fR +Name of the computer on which the Moab server executes. +It is used in establishing a communications path for event notification. +By default the \fBEHost\fR will be identical in value to the +\fBControlAddr\fR configured in slurm.conf. + +.TP +\fBEHostBackup\fR +Name of the computer on which the backup Moab server executes. +It is used in establishing a communications path for event notification. +There is no default value for \fBEHostBackup\fR (no backup +controller is configured). 
+ .TP \fBEPort\fR Port to be used to notify Moab of events (job submitted to Slurm, @@ -77,6 +91,12 @@ AuthKey=1234 .br EPort=15017 .br +# Moab event notification hosts, where Moab executes +.br +EHost=tux0 +.br +EHostBackup=tux1 +.br # Moab event notifcation throttle, matches JOBAGGREGATIONTIME .br # in moab.cfg (integer value in seconds) diff --git a/src/plugins/sched/wiki2/event.c b/src/plugins/sched/wiki2/event.c index fe7817d12f1c69cb27c68ca3ac4eb421c1ce94b1..4e57f732b005442e1a6075a3f5ac02d24809c2a4 100644 --- a/src/plugins/sched/wiki2/event.c +++ b/src/plugins/sched/wiki2/event.c @@ -40,9 +40,8 @@ static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER; static time_t last_notify_time = (time_t) 0; -static slurm_addr moab_event_addr; +static slurm_addr moab_event_addr, moab_event_addr_bu; static int event_addr_set = 0; -static char * control_addr = NULL; /* * event_notify - Notify Moab of some event @@ -69,16 +68,31 @@ extern int event_notify(char *msg) pthread_mutex_lock(&event_mutex); if (event_addr_set == 0) { /* Identify address for socket connection */ - slurm_ctl_conf_t *conf = slurm_conf_lock(); - control_addr = xstrdup(conf->control_addr); - slurm_conf_unlock(); - slurm_set_addr(&moab_event_addr, e_port, control_addr); + if (e_host[0] == '\0') { + slurm_ctl_conf_t *conf = slurm_conf_lock(); + strncpy(e_host, conf->control_addr, + sizeof(e_host)); + slurm_conf_unlock(); + } + slurm_set_addr(&moab_event_addr, e_port, e_host); event_addr_set = 1; + if (e_host_bu[0] != '\0') { + slurm_set_addr(&moab_event_addr_bu, e_port, + e_host_bu); + event_addr_set = 2; + } } event_fd = slurm_open_msg_conn(&moab_event_addr); + if ((event_fd == -1) && (event_addr_set == 2)) + event_fd = slurm_open_msg_conn(&moab_event_addr_bu); if (event_fd == -1) { + char *host_name; + if (event_addr_set == 2) + host_name = e_host_bu; + else + host_name = e_host; error("Unable to open wiki event port %s:%u: %m", - control_addr, e_port); + host_name, e_port); 
pthread_mutex_unlock(&event_mutex); return -1; } diff --git a/src/plugins/sched/wiki2/job_will_run.c b/src/plugins/sched/wiki2/job_will_run.c index ce260c6331a7312debbf9fb65230cede74bc97a6..05293819425106e4127589a6e2a8a33bc045a30e 100644 --- a/src/plugins/sched/wiki2/job_will_run.c +++ b/src/plugins/sched/wiki2/job_will_run.c @@ -108,9 +108,11 @@ static int _will_run_test(uint32_t jobid, char *hostlist, char *new_node_list, *picked_node_list = NULL; bitstr_t *new_bitmap, *save_exc_bitmap, *save_req_bitmap; uint32_t save_prio; - static char *reply_msg; - static int reply_msg_size = 0; bitstr_t *picked_node_bitmap = NULL; + /* Just create a big static message buffer to avoid dealing with + * xmalloc/xfree. We'll switch to compressed node naming soon + * and this buffer can be set smaller then. */ + static char reply_msg[16384]; lock_slurmctld(job_write_lock); job_ptr = find_job_record(jobid); @@ -172,21 +174,19 @@ static int _will_run_test(uint32_t jobid, char *hostlist, if (picked_node_bitmap) { picked_node_list = bitmap2wiki_node_name(picked_node_bitmap); i = strlen(picked_node_list); - if ((i + 64) > reply_msg_size) { - reply_msg_size = i + 1024; - xrealloc(reply_msg, reply_msg_size); - } + if ((i + 64) > sizeof(reply_msg)) + error("wiki: will_run buffer overflow"); } if (rc == SLURM_SUCCESS) { *err_code = 0; - snprintf(reply_msg, reply_msg_size, + snprintf(reply_msg, sizeof(reply_msg), "SC=0 Job %d runnable now TASKLIST:%s", jobid, picked_node_list); *err_msg = reply_msg; } else if (rc == ESLURM_NODES_BUSY) { *err_code = 1; - snprintf(reply_msg, reply_msg_size, + snprintf(reply_msg, sizeof(reply_msg), "SC=1 Job %d runnable later TASKLIST:%s", jobid, picked_node_list); *err_msg = reply_msg; @@ -195,7 +195,7 @@ static int _will_run_test(uint32_t jobid, char *hostlist, error("wiki: job %d never runnable on hosts=%s %s", jobid, new_node_list, err_str); *err_code = -740; - snprintf(reply_msg, reply_msg_size, + snprintf(reply_msg, sizeof(reply_msg), "SC=-740 Job %d not 
runable: %s", jobid, err_str); *err_msg = reply_msg; diff --git a/src/plugins/sched/wiki2/msg.c b/src/plugins/sched/wiki2/msg.c index 40916cffa5351fa3799cbc4a0a6099a093022811..e933da77e80bae2eb0371446724195ec439cf303 100644 --- a/src/plugins/sched/wiki2/msg.c +++ b/src/plugins/sched/wiki2/msg.c @@ -48,6 +48,8 @@ static int err_code; /* Global configuration parameters */ char auth_key[KEY_SIZE] = ""; +char e_host[E_HOST_SIZE] = ""; +char e_host_bu[E_HOST_SIZE] = ""; uint16_t e_port = 0; uint16_t job_aggregation_time = 10; /* Default value is 10 seconds */ int init_prio_mode = PRIO_HOLD; @@ -194,6 +196,8 @@ static void _parse_wiki_config(void) { s_p_options_t options[] = { {"AuthKey", S_P_STRING}, + {"EHost", S_P_STRING}, + {"EHostBackup", S_P_STRING}, {"EPort", S_P_UINT16}, {"JobAggregationTime", S_P_UINT16}, {"JobPriority", S_P_STRING}, @@ -220,6 +224,14 @@ static void _parse_wiki_config(void) strncpy(auth_key, key, sizeof(auth_key)); xfree(key); } + if ( s_p_get_string(&key, "EHost", tbl)) { + strncpy(e_host, key, sizeof(e_host)); + xfree(key); + } + if ( s_p_get_string(&key, "EHostBackup", tbl)) { + strncpy(e_host_bu, key, sizeof(e_host_bu)); + xfree(key); + } s_p_get_uint16(&e_port, "EPort", tbl); s_p_get_uint16(&job_aggregation_time, "JobAggregationTime", tbl); @@ -459,8 +471,7 @@ static void _proc_msg(slurm_fd new_fd, char *msg) } else if (strncmp(cmd_ptr, "JOBRELEASETASK", 14) == 0) { job_release_task(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "JOBWILLRUN", 10) == 0) { - if (!job_will_run(cmd_ptr, &err_code, &err_msg)) - goto free_resp_msg; + job_will_run(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "JOBMODIFY", 9) == 0) { job_modify_wiki(cmd_ptr, &err_code, &err_msg); } else if (strncmp(cmd_ptr, "JOBSIGNAL", 9) == 0) { diff --git a/src/plugins/sched/wiki2/msg.h b/src/plugins/sched/wiki2/msg.h index 431918187953d239e2a817ce12c803b196e06b77..c46254385d02e93896441ec83e83de4d4eaac4e9 100644 --- a/src/plugins/sched/wiki2/msg.h +++ 
b/src/plugins/sched/wiki2/msg.h @@ -81,11 +81,14 @@ #include "src/common/xstring.h" /* Global configuration parameters */ +#define E_HOST_SIZE 256 #define KEY_SIZE 32 #define PRIO_HOLD 0 #define PRIO_DECREMENT 1 extern int init_prio_mode; extern char auth_key[KEY_SIZE]; +extern char e_host[E_HOST_SIZE]; +extern char e_host_bu[E_HOST_SIZE]; extern uint16_t e_port; extern uint16_t job_aggregation_time; extern uint16_t use_host_exp;