From 24af1096ead286acef818bc000d45370a76f303e Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 23 Nov 2005 18:47:15 +0000 Subject: [PATCH] svn merge -r6645:6689 https://eris.llnl.gov/svn/slurm/branches/slurm-0-6-branch --- NEWS | 4 ++++ src/plugins/jobacct/log/jobacct_log.c | 2 +- src/plugins/mpi/mvapich/mvapich.c | 2 +- src/plugins/switch/elan/qsw.c | 1 + src/plugins/switch/federation/federation.c | 12 ++++++++++-- src/slurmctld/backup.c | 5 ++++- src/slurmctld/controller.c | 7 +++---- src/slurmctld/step_mgr.c | 4 ++++ src/slurmd/slurmstepd/ulimits.c | 2 +- src/srun/srun.c | 8 +++++--- 10 files changed, 34 insertions(+), 13 deletions(-) diff --git a/NEWS b/NEWS index 6504499e9f8..2274940a380 100644 --- a/NEWS +++ b/NEWS @@ -68,6 +68,10 @@ documents those changes that are of interest to users and admins. -- Accounting bug causing segv fixed (Andy Riebs, 14oct.jobacct.patch) -- Fix for failed launch of a debugged job (e.g. bad executable name). -- Wiki plugin fix for tracking allocated nodes (Ernest Artiaga, BSC). + -- Fix memory leaks in slurmctld and federation plugin. + -- Fix segfault in federation plugin function fed_libstate_clear(). + -- Align job accounting data (Andy Riebs, slurm.hp.unal_jobacct.patch) + -- Restore switch state in backup controller restarts * Changes in SLURM 0.6.8 ======================== diff --git a/src/plugins/jobacct/log/jobacct_log.c b/src/plugins/jobacct/log/jobacct_log.c index 9e5b0658d87..d1b7315a20e 100644 --- a/src/plugins/jobacct/log/jobacct_log.c +++ b/src/plugins/jobacct/log/jobacct_log.c @@ -122,7 +122,7 @@ typedef struct _stats_msg { _stats_msg_type_t msg_type; uint32_t jobid; /* in network order! 
*/ uint32_t stepid; /* ditto */ - uint16_t datalen; /* ditto */ + uint32_t datalen; /* ditto (and make 'data' aligned) */ char data[MAX_MSG_SIZE]; } _stats_msg_t; diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c index 59fbfb5126d..6ea53023d25 100644 --- a/src/plugins/mpi/mvapich/mvapich.c +++ b/src/plugins/mpi/mvapich/mvapich.c @@ -133,7 +133,7 @@ static struct mvapich_info * mvapich_info_create (int fd) mvi->pid = xmalloc (mvi->pidlen); - if (fd_read_n (fd, &mvi->pid, mvi->pidlen) < 0) + if (fd_read_n (fd, mvi->pid, mvi->pidlen) < 0) E_RET ("mvapich: Unable to read pid for rank %d: %m", mvi->rank); } diff --git a/src/plugins/switch/elan/qsw.c b/src/plugins/switch/elan/qsw.c index 5ceb72406b1..3ad42944ad6 100644 --- a/src/plugins/switch/elan/qsw.c +++ b/src/plugins/switch/elan/qsw.c @@ -415,6 +415,7 @@ qsw_clear(void) int rc = 0; _lock_qsw(); + assert(qsw_internal_state); assert(qsw_internal_state->ls_magic == QSW_LIBSTATE_MAGIC); if (qsw_internal_state->step_ctx_list) list_destroy(qsw_internal_state->step_ctx_list); diff --git a/src/plugins/switch/federation/federation.c b/src/plugins/switch/federation/federation.c index 6ca50961efb..7c4b615ebdc 100644 --- a/src/plugins/switch/federation/federation.c +++ b/src/plugins/switch/federation/federation.c @@ -1712,6 +1712,8 @@ fed_build_jobinfo(fed_jobinfo_t *jp, hostlist_t hl, int nprocs, node = _find_node(fed_state, host); jp->tables_per_task = node ? 
node->adapter_count : 0; _unlock(); + if (host != NULL) + free(host); hostlist_iterator_reset(hi); } else { jp->tables_per_task = 1; @@ -2431,6 +2433,7 @@ fed_libstate_restore(Buf buffer) fed_state = _alloc_libstate(); if(!fed_state) { error("fed_libstate_restore fed_state is NULL"); + _unlock(); return SLURM_FAILURE; } _unpack_libstate(fed_state, buffer); @@ -2449,8 +2452,11 @@ fed_libstate_clear(void) debug3("Clearing state on all windows in global fed state"); _lock(); - if (!fed_state || !fed_state->node_list) + if (!fed_state || !fed_state->node_list) { + error("fed_state or node_list not initialized!"); + _unlock(); return SLURM_ERROR; + } for (i = 0; i < fed_state->node_count; i++) { node = &fed_state->node_list[i]; @@ -2458,10 +2464,12 @@ fed_libstate_clear(void) continue; for (j = 0; j < node->adapter_count; j++) { adapter = &node->adapter_list[i]; - if (!adapter->window_list) + if (!adapter || !adapter->window_list) continue; for (k = 0; k < adapter->window_count; k++) { window = &adapter->window_list[k]; + if (!window) + continue; window->status = NTBL_UNLOADED_STATE; } } diff --git a/src/slurmctld/backup.c b/src/slurmctld/backup.c index b1cc7dc67b2..3f0d7e93ad6 100644 --- a/src/slurmctld/backup.c +++ b/src/slurmctld/backup.c @@ -159,7 +159,10 @@ void run_backup(void) /* clear old state and read new state */ job_fini(); - switch_clear(); + if (switch_restore(slurmctld_conf.state_save_location, true)) { + error("failed to restore switch state"); + abort(); + } if (read_slurm_conf(2)) { /* Recover all state */ error("Unable to recover slurm state"); abort(); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 70f0a9aa330..2c69abb16d1 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -239,10 +239,6 @@ int main(int argc, char *argv[]) if ( checkpoint_init(slurmctld_conf.checkpoint_type) != SLURM_SUCCESS ) fatal( "failed to initialize checkpoint plugin" ); - error_code = 
switch_restore(slurmctld_conf.state_save_location, - recover ? true : false); - if ( error_code != 0) - fatal(" failed to initialize switch plugin" ); if (select_g_state_restore(slurmctld_conf.state_save_location)) fatal( "failed to restore node selection plugin state"); @@ -261,6 +257,9 @@ int main(int argc, char *argv[]) == 0)) { (void) _shutdown_backup_controller(SHUTDOWN_WAIT); /* Now recover the remaining state information */ + if (switch_restore(slurmctld_conf.state_save_location, + recover ? true : false)) + fatal(" failed to initialize switch plugin" ); if ((error_code = read_slurm_conf(recover))) { fatal("read_slurm_conf reading %s: %s", slurmctld_conf.slurm_conf, diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 47aee9840af..df83c9a9591 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -110,6 +110,8 @@ delete_all_step_records (struct job_record *job_ptr) xfree(step_ptr->name); xfree(step_ptr->step_node_list); FREE_NULL_BITMAP(step_ptr->step_node_bitmap); + if (step_ptr->network) + xfree(step_ptr->network); xfree(step_ptr); } @@ -153,6 +155,8 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id) xfree(step_ptr->name); xfree(step_ptr->step_node_list); FREE_NULL_BITMAP(step_ptr->step_node_bitmap); + if (step_ptr->network) + xfree(step_ptr->network); xfree(step_ptr); error_code = 0; break; diff --git a/src/slurmd/slurmstepd/ulimits.c b/src/slurmd/slurmstepd/ulimits.c index 9529f05e833..6e7d2f02af0 100644 --- a/src/slurmd/slurmstepd/ulimits.c +++ b/src/slurmd/slurmstepd/ulimits.c @@ -98,7 +98,7 @@ _set_umask(char **env) return SLURM_ERROR; } - mask = atoi(val); + mask = strtol(val, (char **)NULL, 8); umask(mask); } diff --git a/src/srun/srun.c b/src/srun/srun.c index 8e414e95a10..eac7e904c91 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -718,15 +718,17 @@ _build_script (char *fname, int file_type) /* Set SLURM_UMASK environment variable with current state */ static int _set_umask_env(void) { + 
char mask_char[5]; mode_t mask = (int)umask(0); umask(mask); - if (setenvf(NULL, "SLURM_UMASK", "%d", (int)mask) < 0) { + sprintf(mask_char, "0%d%d%d", + ((mask>>6)&07), ((mask>>3)&07), mask&07); + if (setenvf(NULL, "SLURM_UMASK", "%s", mask_char) < 0) { error ("unable to set SLURM_UMASK in environment"); return SLURM_FAILURE; } - debug ("propagating UMASK=0%d%d%d", - ((mask>>6)&07), ((mask>>3)&07), mask&07); + debug ("propagating UMASK=%s", mask_char); return SLURM_SUCCESS; } -- GitLab