From 24af1096ead286acef818bc000d45370a76f303e Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 23 Nov 2005 18:47:15 +0000
Subject: [PATCH] svn merge -r6645:6689
 https://eris.llnl.gov/svn/slurm/branches/slurm-0-6-branch

---
 NEWS                                       |  4 ++++
 src/plugins/jobacct/log/jobacct_log.c      |  2 +-
 src/plugins/mpi/mvapich/mvapich.c          |  2 +-
 src/plugins/switch/elan/qsw.c              |  1 +
 src/plugins/switch/federation/federation.c | 12 ++++++++++--
 src/slurmctld/backup.c                     |  5 ++++-
 src/slurmctld/controller.c                 |  7 +++----
 src/slurmctld/step_mgr.c                   |  4 ++++
 src/slurmd/slurmstepd/ulimits.c            |  2 +-
 src/srun/srun.c                            |  8 +++++---
 10 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/NEWS b/NEWS
index 6504499e9f8..2274940a380 100644
--- a/NEWS
+++ b/NEWS
@@ -68,6 +68,10 @@ documents those changes that are of interest to users and admins.
  -- Accounting bug causing segv fixed (Andy Riebs, 14oct.jobacct.patch)
  -- Fix for failed launch of a debugged job (e.g. bad executable name).
  -- Wiki plugin fix for tracking allocated nodes (Ernest Artiaga, BSC).
+ -- Fix memory leaks in slurmctld and federation plugin.
+ -- Fix segfault in federation plugin function fed_libstate_clear().
+ -- Align job accounting data (Andy Riebs, slurm.hp.unal_jobacct.patch)
+ -- Restore switch state in backup controller restarts
 
 * Changes in SLURM 0.6.8
 ========================
diff --git a/src/plugins/jobacct/log/jobacct_log.c b/src/plugins/jobacct/log/jobacct_log.c
index 9e5b0658d87..d1b7315a20e 100644
--- a/src/plugins/jobacct/log/jobacct_log.c
+++ b/src/plugins/jobacct/log/jobacct_log.c
@@ -122,7 +122,7 @@ typedef struct _stats_msg {
 	_stats_msg_type_t	msg_type;
 	uint32_t	 	jobid;   /* in network order! */
 	uint32_t		stepid;  /* ditto */
-	uint16_t		datalen; /* ditto */
+	uint32_t		datalen; /* ditto (and make 'data' aligned) */
 	char			data[MAX_MSG_SIZE];
 } _stats_msg_t;
 
diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c
index 59fbfb5126d..6ea53023d25 100644
--- a/src/plugins/mpi/mvapich/mvapich.c
+++ b/src/plugins/mpi/mvapich/mvapich.c
@@ -133,7 +133,7 @@ static struct mvapich_info * mvapich_info_create (int fd)
 
 		mvi->pid = xmalloc (mvi->pidlen);
 
-		if (fd_read_n (fd, &mvi->pid, mvi->pidlen) < 0)
+		if (fd_read_n (fd, mvi->pid, mvi->pidlen) < 0)
 			E_RET ("mvapich: Unable to read pid for rank %d: %m", mvi->rank);
 	}
 
diff --git a/src/plugins/switch/elan/qsw.c b/src/plugins/switch/elan/qsw.c
index 5ceb72406b1..3ad42944ad6 100644
--- a/src/plugins/switch/elan/qsw.c
+++ b/src/plugins/switch/elan/qsw.c
@@ -415,6 +415,7 @@ qsw_clear(void)
 	int rc = 0;
 
 	_lock_qsw();
+	assert(qsw_internal_state);
 	assert(qsw_internal_state->ls_magic == QSW_LIBSTATE_MAGIC);
 	if (qsw_internal_state->step_ctx_list)
 		list_destroy(qsw_internal_state->step_ctx_list);
diff --git a/src/plugins/switch/federation/federation.c b/src/plugins/switch/federation/federation.c
index 6ca50961efb..7c4b615ebdc 100644
--- a/src/plugins/switch/federation/federation.c
+++ b/src/plugins/switch/federation/federation.c
@@ -1712,6 +1712,8 @@ fed_build_jobinfo(fed_jobinfo_t *jp, hostlist_t hl, int nprocs,
 		node = _find_node(fed_state, host);
 		jp->tables_per_task = node ? node->adapter_count : 0;
 		_unlock();
+		if (host != NULL)
+			free(host);
 		hostlist_iterator_reset(hi);
 	} else {
 		jp->tables_per_task = 1;
@@ -2431,6 +2433,7 @@ fed_libstate_restore(Buf buffer)
 	fed_state = _alloc_libstate();
 	if(!fed_state) {
 		error("fed_libstate_restore fed_state is NULL");
+		_unlock();
 		return SLURM_FAILURE;
 	}
 	_unpack_libstate(fed_state, buffer);
@@ -2449,8 +2452,11 @@ fed_libstate_clear(void)
 
 	debug3("Clearing state on all windows in global fed state");
 	_lock();
-	if (!fed_state || !fed_state->node_list)
+	if (!fed_state || !fed_state->node_list) {
+		error("fed_state or node_list not initialized!");
+		_unlock();
 		return SLURM_ERROR;
+	}
 
 	for (i = 0; i < fed_state->node_count; i++) {
 		node = &fed_state->node_list[i];
@@ -2458,10 +2464,12 @@ fed_libstate_clear(void)
 			continue;
 		for (j = 0; j < node->adapter_count; j++) {
 			adapter = &node->adapter_list[i];
-			if (!adapter->window_list)
+			if (!adapter || !adapter->window_list)
 				continue;
 			for (k = 0; k < adapter->window_count; k++) {
 				window = &adapter->window_list[k];
+				if (!window)
+					continue;
 				window->status = NTBL_UNLOADED_STATE;
 			}
 		}
diff --git a/src/slurmctld/backup.c b/src/slurmctld/backup.c
index b1cc7dc67b2..3f0d7e93ad6 100644
--- a/src/slurmctld/backup.c
+++ b/src/slurmctld/backup.c
@@ -159,7 +159,10 @@ void run_backup(void)
 
 	/* clear old state and read new state */
 	job_fini();
-	switch_clear();
+	if (switch_restore(slurmctld_conf.state_save_location, true)) {
+		error("failed to restore switch state");
+		abort();
+	}
 	if (read_slurm_conf(2)) {	/* Recover all state */
 		error("Unable to recover slurm state");
 		abort();
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 70f0a9aa330..2c69abb16d1 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -239,10 +239,6 @@ int main(int argc, char *argv[])
 	if ( checkpoint_init(slurmctld_conf.checkpoint_type) != 
 			SLURM_SUCCESS )
 		fatal( "failed to initialize checkpoint plugin" );
-	error_code = switch_restore(slurmctld_conf.state_save_location,
-				    recover ? true : false);
-	if ( error_code != 0)
-		fatal(" failed to initialize switch plugin" );
 	if (select_g_state_restore(slurmctld_conf.state_save_location))
 		fatal( "failed to restore node selection plugin state");
 
@@ -261,6 +257,9 @@ int main(int argc, char *argv[])
 			  == 0)) {
 			(void) _shutdown_backup_controller(SHUTDOWN_WAIT);
 			/* Now recover the remaining state information */
+			if (switch_restore(slurmctld_conf.state_save_location,
+					 recover ? true : false))
+				fatal(" failed to initialize switch plugin" );
 			if ((error_code = read_slurm_conf(recover))) {
 				fatal("read_slurm_conf reading %s: %s",
 					slurmctld_conf.slurm_conf,
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 47aee9840af..df83c9a9591 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -110,6 +110,8 @@ delete_all_step_records (struct job_record *job_ptr)
 		xfree(step_ptr->name);
 		xfree(step_ptr->step_node_list);
 		FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
+		if (step_ptr->network)
+			xfree(step_ptr->network);
 		xfree(step_ptr);
 	}		
 
@@ -153,6 +155,8 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id)
 			xfree(step_ptr->name);
 			xfree(step_ptr->step_node_list);
 			FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
+			if (step_ptr->network)
+				xfree(step_ptr->network);
 			xfree(step_ptr);
 			error_code = 0;
 			break;
diff --git a/src/slurmd/slurmstepd/ulimits.c b/src/slurmd/slurmstepd/ulimits.c
index 9529f05e833..6e7d2f02af0 100644
--- a/src/slurmd/slurmstepd/ulimits.c
+++ b/src/slurmd/slurmstepd/ulimits.c
@@ -98,7 +98,7 @@ _set_umask(char **env)
 		return SLURM_ERROR;
 	}
 
-	mask = atoi(val);
+	mask = strtol(val, (char **)NULL, 8);
 	umask(mask);
 }
 
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 8e414e95a10..eac7e904c91 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -718,15 +718,17 @@ _build_script (char *fname, int file_type)
 /* Set SLURM_UMASK environment variable with current state */
 static int _set_umask_env(void)
 {
+	char mask_char[5];
 	mode_t mask = (int)umask(0);
 	umask(mask);
 
-	if (setenvf(NULL, "SLURM_UMASK", "%d", (int)mask) < 0) {
+	sprintf(mask_char, "0%d%d%d", 
+		((mask>>6)&07), ((mask>>3)&07), mask&07);
+	if (setenvf(NULL, "SLURM_UMASK", "%s", mask_char) < 0) {
 		error ("unable to set SLURM_UMASK in environment");
 		return SLURM_FAILURE;
 	}
-	debug ("propagating UMASK=0%d%d%d", 
-		((mask>>6)&07), ((mask>>3)&07), mask&07);
+	debug ("propagating UMASK=%s", mask_char); 
 	return SLURM_SUCCESS;
 }
 
-- 
GitLab