From 1d4e932b398882cbc4124a1068f0541346176fd8 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 24 Oct 2006 16:52:36 +0000
Subject: [PATCH] svn merge -r9886:9906
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

---
 NEWS                                   |  6 ++++++
 doc/man/man5/wiki.conf.5               | 22 +++++++++++++++++++-
 src/plugins/sched/wiki2/event.c        | 28 +++++++++++++++++++-------
 src/plugins/sched/wiki2/job_will_run.c | 18 ++++++++---------
 src/plugins/sched/wiki2/msg.c          | 15 ++++++++++++--
 src/plugins/sched/wiki2/msg.h          |  3 +++
 6 files changed, 73 insertions(+), 19 deletions(-)

diff --git a/NEWS b/NEWS
index 598122b2788..8ceb24d8d97 100644
--- a/NEWS
+++ b/NEWS
@@ -82,6 +82,12 @@ documents those changes that are of interest to users and admins.
     the code)
  -- Added support for OSX build.
 
+* Changes in SLURM 1.1.18
+=========================
+ - In sched/wiki2, add support for EHost and EHostBackup configuration 
+   parameters in wiki.conf file.
+ - In sched/wiki2, fix memory management bug for JOBWILLRUN command.
+
 * Changes in SLURM 1.1.17
 =========================
  - BLUEGENE - fix to make dynamic partitioning not go create block where
diff --git a/doc/man/man5/wiki.conf.5 b/doc/man/man5/wiki.conf.5
index 56d49ff5510..772ff3c0f46 100644
--- a/doc/man/man5/wiki.conf.5
+++ b/doc/man/man5/wiki.conf.5
@@ -1,4 +1,4 @@
-.TH "wiki.conf" "5" "September 2006" "wiki.conf 1.1" "Slurm configuration file"
+.TH "wiki.conf" "5" "October 2006" "wiki.conf 1.1" "Slurm configuration file"
 .SH "NAME"
 wiki.conf \- Slurm configuration file for wiki scheduler plugin
 .SH "DESCRIPTION"
@@ -21,6 +21,20 @@ Authentication key for communications.
 This numeric value should match KEY configured in the 
 \fBmoab\-private.cnf\fR file.
 
+.TP
+\fBEHost\fR
+Name of the computer on which the Moab server executes.
+It is used in establishing a communications path for event notification. 
+By default the \fBEHost\fR will be identical in value to the 
+\fBControlAddr\fR configured in slurm.conf.
+
+.TP
+\fBEHostBackup\fR
+Name of the computer on which the backup Moab server executes.
+It is used in establishing a communications path for event notification.
+There is no default value for \fBEHostBackup\fR (by default, no backup
+Moab server is used).
+
 .TP
 \fBEPort\fR
 Port to be used to notify Moab of events (job submitted to Slurm, 
@@ -77,6 +91,12 @@ AuthKey=1234
 .br
 EPort=15017
 .br
+# Moab event notification hosts, where Moab executes
+.br
+EHost=tux0
+.br
+EHostBackup=tux1
+.br
 # Moab event notifcation throttle, matches JOBAGGREGATIONTIME 
 .br
 # in moab.cfg (integer value in seconds)
diff --git a/src/plugins/sched/wiki2/event.c b/src/plugins/sched/wiki2/event.c
index fe7817d12f1..4e57f732b00 100644
--- a/src/plugins/sched/wiki2/event.c
+++ b/src/plugins/sched/wiki2/event.c
@@ -40,9 +40,8 @@
 
 static pthread_mutex_t	event_mutex = PTHREAD_MUTEX_INITIALIZER;
 static time_t		last_notify_time = (time_t) 0;
-static slurm_addr	moab_event_addr;
+static slurm_addr	moab_event_addr,  moab_event_addr_bu;
 static int		event_addr_set = 0;
-static char *		control_addr = NULL;
 
 /*
  * event_notify - Notify Moab of some event
@@ -69,16 +68,31 @@ extern int	event_notify(char *msg)
 	pthread_mutex_lock(&event_mutex);
 	if (event_addr_set == 0) {
 		/* Identify address for socket connection */
-		slurm_ctl_conf_t *conf = slurm_conf_lock();
-		control_addr = xstrdup(conf->control_addr);
-		slurm_conf_unlock();
-		slurm_set_addr(&moab_event_addr, e_port, control_addr);
+		if (e_host[0] == '\0') {
+			slurm_ctl_conf_t *conf = slurm_conf_lock();
+			strncpy(e_host, conf->control_addr, 
+				sizeof(e_host));
+			slurm_conf_unlock();
+		}
+		slurm_set_addr(&moab_event_addr, e_port, e_host);
 		event_addr_set = 1;
+		if (e_host_bu[0] != '\0') {
+			slurm_set_addr(&moab_event_addr_bu, e_port, 
+				e_host_bu);
+			event_addr_set = 2;
+		}
 	}
 	event_fd = slurm_open_msg_conn(&moab_event_addr);
+	if ((event_fd == -1) && (event_addr_set == 2))
+		event_fd = slurm_open_msg_conn(&moab_event_addr_bu);
 	if (event_fd == -1) {
+		char *host_name;
+		if (event_addr_set == 2)
+			host_name = e_host_bu;
+		else
+			host_name = e_host;
 		error("Unable to open wiki event port %s:%u: %m", 
-			control_addr, e_port);
+			host_name, e_port);
 		pthread_mutex_unlock(&event_mutex);
 		return -1;
 	}
diff --git a/src/plugins/sched/wiki2/job_will_run.c b/src/plugins/sched/wiki2/job_will_run.c
index ce260c6331a..05293819425 100644
--- a/src/plugins/sched/wiki2/job_will_run.c
+++ b/src/plugins/sched/wiki2/job_will_run.c
@@ -108,9 +108,11 @@ static int	_will_run_test(uint32_t jobid, char *hostlist,
 	char *new_node_list, *picked_node_list = NULL;
 	bitstr_t *new_bitmap, *save_exc_bitmap, *save_req_bitmap;
 	uint32_t save_prio;
-	static char *reply_msg;
-	static int reply_msg_size = 0;
 	bitstr_t *picked_node_bitmap = NULL;
+	/* Just create a big static message buffer to avoid dealing with
+	 * xmalloc/xfree. We'll switch to compressed node naming soon
+	 * and this buffer can be set smaller then. */
+	static char reply_msg[16384];
 
 	lock_slurmctld(job_write_lock);
 	job_ptr = find_job_record(jobid);
@@ -172,21 +174,19 @@ static int	_will_run_test(uint32_t jobid, char *hostlist,
 	if (picked_node_bitmap) {
 		picked_node_list = bitmap2wiki_node_name(picked_node_bitmap);
 		i = strlen(picked_node_list);
-		if ((i + 64) > reply_msg_size) {
-			reply_msg_size = i + 1024;
-			xrealloc(reply_msg, reply_msg_size);
-		}
+		if ((i + 64) > sizeof(reply_msg))
+			error("wiki: will_run buffer overflow");
 	}
 
 	if (rc == SLURM_SUCCESS) {
 		*err_code = 0;
-		snprintf(reply_msg, reply_msg_size,
+		snprintf(reply_msg, sizeof(reply_msg),
 			"SC=0 Job %d runnable now TASKLIST:%s",
 			jobid, picked_node_list);
 		*err_msg = reply_msg;
 	} else if (rc == ESLURM_NODES_BUSY) {
 		*err_code = 1;
-		snprintf(reply_msg, reply_msg_size,
+		snprintf(reply_msg, sizeof(reply_msg),
 			"SC=1 Job %d runnable later TASKLIST:%s",
 			jobid, picked_node_list);
 		*err_msg = reply_msg;
@@ -195,7 +195,7 @@ static int	_will_run_test(uint32_t jobid, char *hostlist,
 		error("wiki: job %d never runnable on hosts=%s %s", 
 			jobid, new_node_list, err_str);
 		*err_code = -740;
-		snprintf(reply_msg, reply_msg_size, 
+		snprintf(reply_msg, sizeof(reply_msg), 
 			"SC=-740 Job %d not runable: %s", 
 			jobid, err_str);
 		*err_msg = reply_msg;
diff --git a/src/plugins/sched/wiki2/msg.c b/src/plugins/sched/wiki2/msg.c
index 40916cffa53..e933da77e80 100644
--- a/src/plugins/sched/wiki2/msg.c
+++ b/src/plugins/sched/wiki2/msg.c
@@ -48,6 +48,8 @@ static int   err_code;
 
 /* Global configuration parameters */
 char     auth_key[KEY_SIZE] = "";
+char     e_host[E_HOST_SIZE] = "";
+char     e_host_bu[E_HOST_SIZE] = "";
 uint16_t e_port = 0;
 uint16_t job_aggregation_time = 10;	/* Default value is 10 seconds */
 int      init_prio_mode = PRIO_HOLD;
@@ -194,6 +196,8 @@ static void _parse_wiki_config(void)
 {
 	s_p_options_t options[] = {
 		{"AuthKey", S_P_STRING},
+		{"EHost", S_P_STRING},
+		{"EHostBackup", S_P_STRING},
 		{"EPort", S_P_UINT16},
 		{"JobAggregationTime", S_P_UINT16},
 		{"JobPriority", S_P_STRING}, 
@@ -220,6 +224,14 @@ static void _parse_wiki_config(void)
 		strncpy(auth_key, key, sizeof(auth_key));
 		xfree(key);
 	}
+	if ( s_p_get_string(&key, "EHost", tbl)) {
+		strncpy(e_host, key, sizeof(e_host));
+		xfree(key);
+	}
+	if ( s_p_get_string(&key, "EHostBackup", tbl)) {
+		strncpy(e_host_bu, key, sizeof(e_host_bu));
+		xfree(key);
+	}
 	s_p_get_uint16(&e_port, "EPort", tbl);
 	s_p_get_uint16(&job_aggregation_time, "JobAggregationTime", tbl); 
 
@@ -459,8 +471,7 @@ static void	_proc_msg(slurm_fd new_fd, char *msg)
 	} else if (strncmp(cmd_ptr, "JOBRELEASETASK", 14) == 0) {
 		job_release_task(cmd_ptr, &err_code, &err_msg);
 	} else if (strncmp(cmd_ptr, "JOBWILLRUN", 10) == 0) {
-		if (!job_will_run(cmd_ptr, &err_code, &err_msg))
-			goto free_resp_msg;
+		job_will_run(cmd_ptr, &err_code, &err_msg);
 	} else if (strncmp(cmd_ptr, "JOBMODIFY", 9) == 0) {
 		job_modify_wiki(cmd_ptr, &err_code, &err_msg);
 	} else if (strncmp(cmd_ptr, "JOBSIGNAL", 9) == 0) {
diff --git a/src/plugins/sched/wiki2/msg.h b/src/plugins/sched/wiki2/msg.h
index 43191818795..c46254385d0 100644
--- a/src/plugins/sched/wiki2/msg.h
+++ b/src/plugins/sched/wiki2/msg.h
@@ -81,11 +81,14 @@
 #include "src/common/xstring.h"
 
 /* Global configuration parameters */
+#define E_HOST_SIZE  256
 #define KEY_SIZE      32
 #define PRIO_HOLD      0
 #define PRIO_DECREMENT 1
 extern int	init_prio_mode;
 extern char 	auth_key[KEY_SIZE];
+extern char	e_host[E_HOST_SIZE];
+extern char	e_host_bu[E_HOST_SIZE];
 extern uint16_t	e_port;
 extern uint16_t	job_aggregation_time;
 extern uint16_t use_host_exp;
-- 
GitLab