diff --git a/NEWS b/NEWS index e6e53964776bc42353fc0ab17a270b4b334e4469..d5dd565431f67d0576d1e71e768bd4bf9f5f9b4a 100644 --- a/NEWS +++ b/NEWS @@ -8,6 +8,13 @@ documents those changes that are of interest to users and admins. slurmctld. -- fixed partition_allocator to work without curses +* Changes in SLURM 0.6.0 +======================== + -- Added SLURM_LOCALID environment variable for spawned tasks + (Dan Palermo, HP). + -- Modify switch logic to restore state based exclusively upon + recovered job steps (not state save file). + * Changes in SLURM 0.6.0-pre6 ============================= -- Added logic to return scheduled nodes to Maui scheduler (David @@ -85,6 +92,8 @@ documents those changes that are of interest to users and admins. -- "Session manager" slurmd process has been eliminated. -- switch/federation fixes migrated from 0.5.* -- srun pthreads really set detached, fixes scaling problem + -- srun spawns message handler process so it can now be stopped (via + Ctrl-Z or TotalView) without inducing failures. * Changes in SLURM 0.5.7 ======================== diff --git a/auxdir/x_ac_ncurses.m4 b/auxdir/x_ac_ncurses.m4 index c647bdb538f9188200891f33016ef0e9d1aedbec..51e62b668e6053b55bc7af98c45c9c309a9ade59 100644 --- a/auxdir/x_ac_ncurses.m4 +++ b/auxdir/x_ac_ncurses.m4 @@ -29,7 +29,7 @@ AC_DEFUN([X_AC_NCURSES], NCURSES="-lcurses" ac_have_some_curses="yes" else - AC_MSG_NOTICE([Can not build smap without curses or ncurses library]) + AC_MSG_ERROR([Can not build slurm without curses or ncurses library]) ac_have_some_curses="no" fi ]) diff --git a/auxdir/x_ac_readline.m4 b/auxdir/x_ac_readline.m4 index 5c3ebaf30ac7122e054d78070f7733ecc3f6b507..1721449c6af5c779389831e71d0d0a185f1392c6 100644 --- a/auxdir/x_ac_readline.m4 +++ b/auxdir/x_ac_readline.m4 @@ -12,7 +12,7 @@ # # # WARNINGS: -# This macro must be placed after AC_PROG_CC or equivalent. +# This macro must be placed after AC_PROG_CC and X_AC_CURSES. 
##***************************************************************************** AC_DEFUN([X_AC_READLINE], @@ -28,17 +28,21 @@ AC_DEFUN([X_AC_READLINE], esac ] ) + AC_MSG_RESULT([${ac_with_readline=yes}]) if test "$ac_with_readline" = "yes"; then - savedLIBS="$LIBS" - READLINE_LIBS="-lreadline -lhistory -lncurses" - - AC_CHECK_LIB([readline], [readline], [], - AC_MSG_ERROR([Cannot find libreadline!]), [ -lhistory -lncurses ]) - - AC_DEFINE([HAVE_READLINE], [1], - [Define if you are compiling with readline.]) - LIBS="$savedLIBS" + saved_LIBS="$LIBS" + READLINE_LIBS="-lreadline -lhistory $NCURSES" + LIBS="$saved_LIBS $READLINE_LIBS" + AC_TRY_LINK( + [ #include <stdio.h> + #include <readline/readline.h> + #include <readline/history.h>], [ + char *line = readline("in:");], + [AC_DEFINE([HAVE_READLINE], [1], + [Define if you are compiling with readline.])], + [READLINE_LIBS=""]) + LIBS="$saved_LIBS" fi AC_SUBST(READLINE_LIBS) ]) diff --git a/doc/html/team.html b/doc/html/team.html index 71d69cd1cf2d6166dd9abb3a9eb2a8b5c8b41df9..875e7f2d154f5979cbf63f762769bc4af0187019 100644 --- a/doc/html/team.html +++ b/doc/html/team.html @@ -9,7 +9,7 @@ <meta http-equiv="keywords" content="Simple Linux Utility for Resource Management, SLURM, resource management, Linux clusters, high-performance computing, Livermore Computing"> <meta name="LLNLRandR" content="UCRL-WEB-213976"> -<meta name="LLNLRandRdate" content="12 September 2005"> +<meta name="LLNLRandRdate" content="16 September 2005"> <meta name="distribution" content="global"> <meta name="description" content="Simple Linux Utility for Resource Management"> <meta name="copyright" @@ -80,6 +80,7 @@ The current SLURM development staff includes: </p> <li>David Jackson (Cluster Resources)</li> <li>Jason King (LLNL)</li> <li>Bryan O'Sullivan (Pathscale)</li> +<li>Daniel Palermo (HP)</li> <li>Dan Phung (LLNL/Columbia University)</li> <li>Jeff Squyres (LAM MPI)</li> <li>Kevin Tew (LLNL/Bringham Young University)</li> @@ -89,7 +90,7 @@ The
current SLURM development staff includes: </p> <td colspan="3"><hr> <p>For information about this page, contact <a href="mailto:slurm-dev@lists.llnl.gov">slurm-dev@lists.llnl.gov</a>.</p> <p><a href="http://www.llnl.gov/"><img align=middle src="lll.gif" width="32" height="32" border="0"></a></p> <p class="footer">UCRL-WEB-213976<br> -Last modified 12 September 2005</p></td> +Last modified 16 September 2005</p></td> </tr> </table> </td> diff --git a/etc/slurm.epilog.clean b/etc/slurm.epilog.clean index f7d1a4c8d452f92a5eb50e91d45546edee39915f..4ae37ae465198be462561aec8bce51ecd5599914 100644 --- a/etc/slurm.epilog.clean +++ b/etc/slurm.epilog.clean @@ -34,3 +34,4 @@ done # No other SLURM jobs, purge all remaining processes of this user # pkill -KILL -U $SLURM_UID +exit 0 diff --git a/src/common/env.c b/src/common/env.c index d68f533d2e7c0e13643267229c8d13f435563405..553f271b98a1d388a2a03bbd90688fa10a89723f 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -311,6 +311,12 @@ int setup_env(env_t *env) rc = SLURM_FAILURE; } + if (env->localid >= 0 + && setenvf(&env->env, "SLURM_LOCALID", "%d", env->localid)) { + error("Unable to set SLURM_LOCALID environment"); + rc = SLURM_FAILURE; + } + if (env->stepid >= 0 && setenvf(&env->env, "SLURM_STEPID", "%d", env->stepid)) { error("Unable to set SLURM_STEPID environment"); diff --git a/src/common/env.h b/src/common/env.h index ac43d09afa933f3b1d45a3b8644e3a49c680983b..57b46e9cb9f02d31d65c5e67c64dc276cca14804 100644 --- a/src/common/env.h +++ b/src/common/env.h @@ -53,7 +53,8 @@ typedef struct env_options { slurm_addr *self; int jobid; /* assigned job id */ int stepid; /* assigned step id */ - int procid; + int procid; /* global task id (across nodes) */ + int localid; /* local task id (within node) */ int nodeid; int gmpi; int cpus_per_task; /* --cpus-per-task=n, -c n */ diff --git a/src/common/switch.c b/src/common/switch.c index 51a4d7ef033df3e342bbc388f45cc496cbaa8f77..64dafde340853ee54f2437313963d6624d9c70e8 100644 
--- a/src/common/switch.c +++ b/src/common/switch.c @@ -45,7 +45,8 @@ */ typedef struct slurm_switch_ops { int (*state_save) ( char *dir_name ); - int (*state_restore) ( char *dir_name ); + int (*state_restore) ( char *dir_name, bool recover ); + bool (*no_frag) ( void ); int (*alloc_jobinfo) ( switch_jobinfo_t *jobinfo ); int (*build_jobinfo) ( switch_jobinfo_t jobinfo, @@ -93,6 +94,9 @@ typedef struct slurm_switch_ops { char *buf, size_t size ); int (*step_complete) ( switch_jobinfo_t jobinfo, char *nodelist ); + int (*step_allocated) ( switch_jobinfo_t jobinfo, + char *nodelist ); + int (*state_clear) ( void ); } slurm_switch_ops_t; struct slurm_switch_context { @@ -194,7 +198,9 @@ _slurm_switch_get_ops( slurm_switch_context_t c ) "switch_p_unpack_node_info", "switch_p_free_node_info", "switch_p_sprintf_node_info", - "switch_p_job_step_complete" + "switch_p_job_step_complete", + "switch_p_job_step_allocated", + "switch_p_libstate_clear" }; int n_syms = sizeof( syms ) / sizeof( char * ); @@ -286,12 +292,20 @@ extern int switch_save(char *dir_name) return (*(g_context->ops.state_save))( dir_name ); } -extern int switch_restore(char *dir_name) +extern int switch_restore(char *dir_name, bool recover) { if ( switch_init() < 0 ) return SLURM_ERROR; - return (*(g_context->ops.state_restore))( dir_name ); + return (*(g_context->ops.state_restore))( dir_name, recover ); +} + +extern int switch_clear(void) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.state_clear))( ); } extern bool switch_no_frag(void) @@ -528,3 +542,12 @@ extern int switch_g_job_step_complete(switch_jobinfo_t jobinfo, return (*(g_context->ops.step_complete))( jobinfo, nodelist ); } + +extern int switch_g_job_step_allocated(switch_jobinfo_t jobinfo, + char *nodelist) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.step_allocated))( jobinfo, nodelist ); +} diff --git a/src/common/switch.h b/src/common/switch.h index 
a694a2a01663ea15cbb6959fd6a63ab0656ec9ca..5b814c2fb95e65baaeed7097af37d15fb5f3932d 100644 --- a/src/common/switch.h +++ b/src/common/switch.h @@ -68,11 +68,17 @@ extern int switch_save (char *dir_name); /* restore any global switch state from a file within the specified directory * the actual file name used in plugin specific - * IN dir_name - directory from hich switch state is restored or NULL for - * switch restart with no state restored + * IN dir_name - directory from which switch state is restored + * IN recover - "true" to restore switch state, "false" to start with + * a clean slate. * RET - slurm error code */ -extern int switch_restore(char *dir_name); +extern int switch_restore(char *dir_name, bool recover); + +/* clear all current switch window allocation information + * RET - slurm error code + */ +extern int switch_clear(void); /* report if resource fragmentation is important. if so, delay scheduling a * new job while another is in the process of terminating. @@ -155,6 +161,14 @@ extern int switch_g_get_jobinfo(switch_jobinfo_t jobinfo, extern int switch_g_job_step_complete(switch_jobinfo_t jobinfo, char *nodelist); +/* + * Restore the switch allocation information "jobinfo" for an already + * allocated job step, most likely to restore the switch information + * after a call to switch_clear(). 
+ */ +extern int switch_g_job_step_allocated(switch_jobinfo_t jobinfo, + char *nodelist); + /* write job credential string representation to a file * IN fp - an open file pointer * IN jobinfo - a switch job credential diff --git a/src/plugins/select/bluegene/bgl_job_run.c b/src/plugins/select/bluegene/bgl_job_run.c index 16a4de0dfe8e9ef5fc58eb4daa2f7f2e5ff9874a..7f7572ddb2bf42948710a5091714e9ef8d37e4e3 100644 --- a/src/plugins/select/bluegene/bgl_job_run.c +++ b/src/plugins/select/bluegene/bgl_job_run.c @@ -349,7 +349,8 @@ static void _term_agent(bgl_update_t *bgl_update_ptr) time_t now; struct tm *time_ptr; char reason[128]; - + int job_remove_failed = 0; + debug2("getting the job info"); live_states = JOB_ALL_FLAG & (~JOB_TERMINATED_FLAG) @@ -407,15 +408,7 @@ static void _term_agent(bgl_update_t *bgl_update_ptr) } debug2("got job_id %d",job_id); if((rc = _remove_job(job_id)) == INTERNAL_ERROR) { - now = time(NULL); - time_ptr = localtime(&now); - strftime(reason, sizeof(reason), - "_term_agent: " - "Couldn't remove job " - "[SLURM@%b %d %H:%M]", - time_ptr); - slurm_drain_nodes(bgl_record->nodes, - reason); + job_remove_failed = 1; break; } } @@ -426,10 +419,25 @@ static void _term_agent(bgl_update_t *bgl_update_ptr) } debug2("got the record %s user is %s", bgl_record->bgl_part_id, bgl_record->user_name); - + + if(job_remove_failed) { + now = time(NULL); time_ptr = localtime(&now); + strftime(reason, sizeof(reason), + "_term_agent: " + "Couldn't remove job " + "[SLURM@%b %d %H:%M]", + time_ptr); + if(bgl_record->nodes) + slurm_drain_nodes(bgl_record->nodes, + reason); + else + error("Partition %s doesn't have a node list.", + bgl_update_ptr->bgl_part_id); + } + slurm_mutex_lock(&part_state_mutex); bgl_record->job_running = 0; - + /*remove user from list */ if(bgl_record->target_name) { if(strcmp(bgl_record->target_name, diff --git a/src/plugins/switch/elan/qsw.c b/src/plugins/switch/elan/qsw.c index 44eaf73fc58a8ad454d02c07ddafaf60308284bf..e4f2677146fb009ba90d601718bb15d17bc696f3 
100644 --- a/src/plugins/switch/elan/qsw.c +++ b/src/plugins/switch/elan/qsw.c @@ -1,5 +1,6 @@ /*****************************************************************************\ * qsw.c - Library routines for initiating jobs on QsNet. + * $Id$ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -24,7 +25,7 @@ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. \*****************************************************************************/ -#if HAVE_CONFIG_H +#ifdef HAVE_CONFIG_H # include "config.h" #endif @@ -408,6 +409,28 @@ qsw_fini(qsw_libstate_t savestate) _unlock_qsw(); } +int +qsw_clear(void) +{ + int rc = 0; + + _lock_qsw(); + assert(qsw_internal_state->ls_magic == QSW_LIBSTATE_MAGIC); + if (qsw_internal_state->step_ctx_list) + list_destroy(qsw_internal_state->step_ctx_list); + qsw_internal_state->step_ctx_list = list_create(_step_ctx_del); + if (elanconf) + elanhost_config_destroy(elanconf); + if (!(elanconf = elanhost_config_create ())) { + rc = -1; + goto done; + } + qsw_internal_state->ls_prognum = QSW_PRG_START + + (lrand48() % (QSW_PRG_END - QSW_PRG_START + 1)); +done: _unlock_qsw(); + return rc; +} + /* * Allocate a qsw_jobinfo_t. 
* jp (IN) store pointer to new instantiation here @@ -661,6 +684,43 @@ _alloc_hwcontext(bitstr_t *nodeset, uint32_t prognum, int num) return new; } +extern int qsw_restore_jobinfo(struct qsw_jobinfo *jobinfo) +{ + struct step_ctx *step_ctx_p; + ListIterator iter; + int duplicate = 0; + + assert(qsw_internal_state); + if (!jobinfo) + return 0; + + assert(jobinfo->j_magic == QSW_JOBINFO_MAGIC); + _lock_qsw(); + + /* check for duplicate */ + iter = list_iterator_create(qsw_internal_state->step_ctx_list); + while ((step_ctx_p = list_next(iter))) { + if (jobinfo->j_prognum == step_ctx_p->st_prognum) { + duplicate = 1; + break; + } + } + list_iterator_destroy(iter); + if (!duplicate) { /* need new record */ + step_ctx_p = xmalloc(sizeof(struct step_ctx)); + step_ctx_p->st_prognum = jobinfo->j_prognum; + } + step_ctx_p->st_low = jobinfo->j_cap.LowContext - QSW_CTX_START; + step_ctx_p->st_high = jobinfo->j_cap.HighContext - QSW_CTX_START; + step_ctx_p->st_low_node = jobinfo->j_cap.LowNode; + step_ctx_p->st_high_node = jobinfo->j_cap.HighNode; + _dump_step_ctx("qsw_restore_jobinfo", step_ctx_p); + if (!duplicate) + list_push(qsw_internal_state->step_ctx_list, step_ctx_p); + _unlock_qsw(); + return 0; +} + static void _free_hwcontext(uint32_t prog_num) { diff --git a/src/plugins/switch/elan/switch_elan.c b/src/plugins/switch/elan/switch_elan.c index 80385ca569f99a787c00c3a0849b2975d10c1105..68fc06e318b27c72cb2842028699d55e63ed1414 100644 --- a/src/plugins/switch/elan/switch_elan.c +++ b/src/plugins/switch/elan/switch_elan.c @@ -214,7 +214,7 @@ int switch_p_libstate_save (char *dir_name) return error_code; } -int switch_p_libstate_restore (char *dir_name) +int switch_p_libstate_restore (char *dir_name, bool recover) { char *data = NULL, *file_name; qsw_libstate_t old_state = NULL; @@ -224,7 +224,7 @@ int switch_p_libstate_restore (char *dir_name) char *ver_str = NULL; uint16_t ver_str_len; - if (dir_name == NULL) /* clean start, no recovery */ + if (!recover) /* clean start, 
no recovery */ return qsw_init(NULL); file_name = xstrdup(dir_name); @@ -293,6 +293,12 @@ int switch_p_libstate_restore (char *dir_name) return error_code; } +int switch_p_libstate_clear ( void ) +{ + return qsw_clear(); +} + + bool switch_p_no_frag ( void ) { return true; @@ -795,3 +801,8 @@ extern int switch_p_job_step_complete(switch_jobinfo_t jobinfo, return SLURM_SUCCESS; } + +extern int switch_p_job_step_allocated(switch_jobinfo_t jobinfo, char *nodelist) +{ + return qsw_restore_jobinfo((qsw_jobinfo_t) jobinfo); +} diff --git a/src/plugins/switch/federation/federation.c b/src/plugins/switch/federation/federation.c index 409efd442c8a00f618136a159ea54a35c65f4fc5..206831578cf6b864b7d1fb97e1ec282027f9c3f2 100644 --- a/src/plugins/switch/federation/federation.c +++ b/src/plugins/switch/federation/federation.c @@ -65,6 +65,7 @@ #define BUFSIZE 4096 char* fed_conf = NULL; +extern bool fed_need_state_save; mode_t fed_umask; /* @@ -1044,6 +1045,8 @@ _alloc_node(fed_libstate_t *lp, char *name) return n; } + fed_need_state_save = true; + if(lp->node_count >= lp->node_max) { lp->node_max += FED_NODECOUNT; new_bufsize = lp->node_max * sizeof(fed_nodeinfo_t); @@ -1395,14 +1398,14 @@ _allocate_windows(int adapter_cnt, fed_tableinfo_t *tableinfo, } -/* Find the correct NTBL structs and remove the allocation - * of adapters, lids and switch windows for the specified task_id. +/* Find the correct NTBL structs and set the state + * of the switch windows for the specified task_id. 
* * Used by: slurmctld */ static int -_deallocate_windows(int adapter_cnt, fed_tableinfo_t *tableinfo, - char *hostname, int task_id) +_window_state_set(int adapter_cnt, fed_tableinfo_t *tableinfo, + char *hostname, int task_id, enum NTBL_RC state) { fed_nodeinfo_t *node; fed_adapter_t *adapter; @@ -1424,7 +1427,6 @@ _deallocate_windows(int adapter_cnt, fed_tableinfo_t *tableinfo, return SLURM_ERROR; } - /* Reserve a window on each adapter for this task */ for (i = 0; i < adapter_cnt; i++) { adapter = &node->adapter_list[i]; if (tableinfo[i].table == NULL) { @@ -1437,15 +1439,20 @@ _deallocate_windows(int adapter_cnt, fed_tableinfo_t *tableinfo, return SLURM_ERROR; } if (adapter->lid != table->lid) { - error("Did not find the correct adapter: %hu vs. %hu", - adapter->lid, table->lid); + if (table->lid != 0) + error("Did not find the correct adapter: " + "%hu vs. %hu", + adapter->lid, table->lid); return SLURM_ERROR; } - debug3("Clearing adapter %s, lid %hu, window %hu for task %d", - adapter->name, table->lid, table->window_id, task_id); + debug3("Setting status %s adapter %s, " + "lid %hu, window %hu for task %d", + state == NTBL_UNLOADED_STATE ? "UNLOADED" : "LOADED", + adapter->name, + table->lid, table->window_id, task_id); window = _find_window(adapter, table->window_id); if (window) - window->status = NTBL_UNLOADED_STATE; + window->status = state; } return SLURM_SUCCESS; @@ -1487,15 +1494,16 @@ _print_index(char *index, int size) } printf("--End lid index--\n"); } -#endif +#endif -/* Find mark as free all of the windows used by this job step. +/* Find all of the windows used by this job step and set their + * status to "state". 
* * Used by: slurmctld */ -int -fed_job_step_complete(fed_jobinfo_t *jp, hostlist_t hl) +static int +_job_step_window_state(fed_jobinfo_t *jp, hostlist_t hl, enum NTBL_RC state) { hostlist_iterator_t hi; char *host; @@ -1505,11 +1513,19 @@ fed_job_step_complete(fed_jobinfo_t *jp, hostlist_t hl) int i, j; int rc; + xassert(!hostlist_is_empty(hl)); + xassert(jp); + xassert(jp->magic == FED_JOBINFO_MAGIC); + if ((jp == NULL) || (jp->magic != FED_JOBINFO_MAGIC) || (hostlist_is_empty(hl))) return SLURM_ERROR; + xassert(jp->tables_per_task); + xassert(jp->tableinfo); + xassert(jp->tableinfo[0].table_length); + if ((jp->tables_per_task == 0) || !jp->tableinfo || (jp->tableinfo[0].table_length == 0)) @@ -1527,9 +1543,10 @@ fed_job_step_complete(fed_jobinfo_t *jp, hostlist_t hl) hostlist_iterator_reset(hi); host = hostlist_next(hi); } - rc = _deallocate_windows(jp->tables_per_task, - jp->tableinfo, - host, proc_cnt); + rc = _window_state_set(jp->tables_per_task, + jp->tableinfo, + host, proc_cnt, + state); free(host); } _unlock(); @@ -1558,9 +1575,10 @@ fed_job_step_complete(fed_jobinfo_t *jp, hostlist_t hl) task_cnt = min_procs_per_node; for (j = 0; j < task_cnt; j++) { - rc = _deallocate_windows(jp->tables_per_task, - jp->tableinfo, - host, proc_cnt); + rc = _window_state_set(jp->tables_per_task, + jp->tableinfo, + host, proc_cnt, + state); proc_cnt++; } free(host); @@ -1572,6 +1590,33 @@ fed_job_step_complete(fed_jobinfo_t *jp, hostlist_t hl) return SLURM_SUCCESS; } +/* Find all of the windows used by job step "jp" and mark their + * state NTBL_UNLOADED_STATE. + * + * Used by: slurmctld + */ +int +fed_job_step_complete(fed_jobinfo_t *jp, hostlist_t hl) +{ + return _job_step_window_state(jp, hl, NTBL_UNLOADED_STATE); +} + + +/* Find all of the windows used by job step "jp" and mark their + * state NTBL_LOADED_STATE. 
+ * + * Used by the slurmctld at startup time to restore the allocation + * status of any job steps that were running at the time the previous + * slurmctld was shutdown. Also used to restore the allocation + * status after a call to switch_clear(). + */ +int +fed_job_step_allocated(fed_jobinfo_t *jp, hostlist_t hl) +{ + return _job_step_window_state(jp, hl, NTBL_LOADED_STATE); +} + + /* Setup everything for the job. Assign tasks across * nodes in a block or cyclic fashion and create the network table used @@ -2326,7 +2371,7 @@ fed_libstate_save(Buf buffer) { _lock(); _pack_libstate(fed_state, buffer); - _free_libstate(fed_state); + /*_free_libstate(fed_state);*/ _unlock(); } @@ -2345,13 +2390,14 @@ _unpack_libstate(fed_libstate_t *lp, Buf buffer) safe_unpack32(&lp->magic, buffer); safe_unpack32(&node_count, buffer); for(i = 0; i < node_count; i++) - (void)_unpack_nodeinfo(NULL, buffer, true); + (void)_unpack_nodeinfo(NULL, buffer, false); assert(lp->node_count == node_count); safe_unpack16(&lp->key_index, buffer); return SLURM_SUCCESS; unpack_error: + error("unpack error in _unpack_libstate"); slurm_seterrno_ret(EBADMAGIC_FEDLIBSTATE); return SLURM_ERROR; } @@ -2366,11 +2412,44 @@ fed_libstate_restore(Buf buffer) assert(!fed_state); fed_state = _alloc_libstate(); - if(!fed_state) { + error("fed_libstate_restore fed_state is NULL"); return SLURM_FAILURE; + } _unpack_libstate(fed_state, buffer); _unlock(); return SLURM_SUCCESS; } +int +fed_libstate_clear(void) +{ + int i, j, k; + struct fed_nodeinfo *node; + struct fed_adapter *adapter; + struct fed_window *window; + + debug3("Clearing state on all windows in global fed state"); + _lock(); + if (!fed_state || !fed_state->node_list) { + _unlock(); return SLURM_ERROR; } + + for (i = 0; i < fed_state->node_count; i++) { + node = &fed_state->node_list[i]; + if (!node->adapter_list) + continue; + for (j = 0; j < node->adapter_count; j++) { + adapter = &node->adapter_list[j]; + if (!adapter->window_list) + continue; + 
for (k = 0; k < adapter->window_count; k++) { + window = &adapter->window_list[k]; + window->status = NTBL_UNLOADED_STATE; + } + } + } + _unlock(); + + return SLURM_SUCCESS; +} diff --git a/src/plugins/switch/federation/federation.h b/src/plugins/switch/federation/federation.h index ae92abcc72edded50526971a0dd3b6a9aba51232..6c8328b2667eac52bfbdebca13af04778c1fb4f4 100644 --- a/src/plugins/switch/federation/federation.h +++ b/src/plugins/switch/federation/federation.h @@ -87,5 +87,8 @@ int fed_unpack_libstate(fed_libstate_t *lp, Buf buffer); int fed_get_jobinfo(fed_jobinfo_t *jp, int key, void *data); void fed_libstate_save(Buf buffer); int fed_libstate_restore(Buf buffer); +int fed_job_step_complete(fed_jobinfo_t *jp, hostlist_t hl); +int fed_job_step_allocated(fed_jobinfo_t *jp, hostlist_t hl); +int fed_libstate_clear(void); #endif /* _FEDERATION_INCLUDED */ diff --git a/src/plugins/switch/federation/switch_federation.c b/src/plugins/switch/federation/switch_federation.c index 091e410c95f142d7e1fb2f27858ba09ceedc9776..5c8652ab6050c7a213726e22e8ee7ba54c35c68d 100644 --- a/src/plugins/switch/federation/switch_federation.c +++ b/src/plugins/switch/federation/switch_federation.c @@ -40,6 +40,10 @@ #define BUF_SIZE 1024 +bool fed_need_state_save = false; + +static void _spawn_state_save_thread(char *dir); + /* Type for error string table entries */ typedef struct { int xe_number; @@ -174,14 +178,25 @@ int switch_p_libstate_save ( char * dir_name ) return ret; } -int switch_p_libstate_restore ( char * dir_name ) + +/* + * Restore global nodeinfo from a file. + * + * NOTE: switch_p_libstate_restore is only called by slurmctld, and only + * once at start-up. We exploit (abuse?) this fact to spawn a pthread to + * periodically call switch_p_libstate_save(). 
+ */ +int switch_p_libstate_restore ( char * dir_name, bool recover ) { char *data = NULL, *file_name; Buf buffer = NULL; int error_code = SLURM_SUCCESS; int state_fd, data_allocated = 0, data_read = 0, data_size = 0; - if (dir_name == NULL) /* clean start, no recovery */ + xassert(dir_name != NULL); + + _spawn_state_save_thread(xstrdup(dir_name)); + if (!recover) /* clean start, no recovery */ return fed_init(); file_name = xstrdup(dir_name); @@ -228,6 +243,11 @@ int switch_p_libstate_restore ( char * dir_name ) return error_code; } +int switch_p_libstate_clear(void) +{ + return fed_libstate_clear(); +} + /* * switch state monitoring functions */ @@ -399,6 +419,18 @@ int switch_p_job_step_complete(switch_jobinfo_t jobinfo, char *nodelist) return rc; } +int switch_p_job_step_allocated(switch_jobinfo_t jobinfo, char *nodelist) +{ + hostlist_t list = NULL; + int rc; + + list = hostlist_create(nodelist); + rc = fed_job_step_allocated((fed_jobinfo_t *)jobinfo, list); + hostlist_destroy(list); + + return rc; +} + void switch_p_print_jobinfo(FILE *fp, switch_jobinfo_t jobinfo) { return; @@ -531,3 +563,28 @@ char *switch_p_strerror(int errnum) char *res = _lookup_slurm_api_errtab(errnum); return (res ? 
res : strerror(errnum)); } + + +static void *_state_save_thread(void *arg) +{ + char *dir_name = (char *)arg; + + while (1) { + sleep(300); + if (fed_need_state_save) { + fed_need_state_save = false; + switch_p_libstate_save(dir_name); + } + } +} + +static void _spawn_state_save_thread(char *dir) +{ + pthread_attr_t attr; + pthread_t id; + + slurm_attr_init(&attr); + + if (pthread_create(&id, &attr, &_state_save_thread, (void *)dir) != 0) + error("Could not start federation state saving pthread"); +} diff --git a/src/plugins/switch/none/switch_none.c b/src/plugins/switch/none/switch_none.c index 2faab3f68f9fef4aade2559c1ffac923b9c56ee4..ca74c8d06bc6404dc831411d9701987da6617ad6 100644 --- a/src/plugins/switch/none/switch_none.c +++ b/src/plugins/switch/none/switch_none.c @@ -88,7 +88,12 @@ int switch_p_libstate_save ( char * dir_name ) return SLURM_SUCCESS; } -int switch_p_libstate_restore ( char * dir_name ) +int switch_p_libstate_restore ( char * dir_name, bool recover ) +{ + return SLURM_SUCCESS; +} + +int switch_p_libstate_clear ( void ) { return SLURM_SUCCESS; } @@ -273,3 +278,9 @@ extern int switch_p_job_step_complete(switch_jobinfo_t jobinfo, return SLURM_SUCCESS; } +extern int switch_p_job_step_allocated(switch_jobinfo_t jobinfo, + char *nodelist) +{ + return SLURM_SUCCESS; +} + diff --git a/src/slurmctld/backup.c b/src/slurmctld/backup.c index 8058cecf1e67214dcba2b97d712572495d246f0a..b1cc7dc67b26e7c4b14948574237f0c7a92cdd7e 100644 --- a/src/slurmctld/backup.c +++ b/src/slurmctld/backup.c @@ -46,6 +46,7 @@ #include "src/common/log.h" #include "src/common/macros.h" #include "src/common/slurm_auth.h" +#include "src/common/switch.h" #include "src/common/xsignal.h" #include "src/common/xstring.h" @@ -158,6 +159,7 @@ void run_backup(void) /* clear old state and read new state */ job_fini(); + switch_clear(); if (read_slurm_conf(2)) { /* Recover all state */ error("Unable to recover slurm state"); abort(); diff --git a/src/slurmctld/controller.c 
b/src/slurmctld/controller.c index 8028bc9a840731a70604e76534caa940fdc60bfc..8746b88b452755f68661caff0fb0fc456fc59d73 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -239,6 +239,10 @@ int main(int argc, char *argv[]) if ( checkpoint_init(slurmctld_conf.checkpoint_type) != SLURM_SUCCESS ) fatal( "failed to initialize checkpoint plugin" ); + error_code = switch_restore(slurmctld_conf.state_save_location, + recover ? true : false); + if ( error_code != 0) + fatal(" failed to initialize switch plugin" ); if (select_g_state_restore(slurmctld_conf.state_save_location)) fatal( "failed to restore node selection plugin state"); @@ -270,16 +274,12 @@ int main(int argc, char *argv[]) } info("Running as primary controller"); - /* Recover node scheduler and switch state info */ + /* Recover node scheduler state info */ if (select_g_state_restore(slurmctld_conf.state_save_location) != SLURM_SUCCESS ) { error("failed to restore node selection state"); abort(); } - if (switch_state_begin(recover)) { - error("switch_state_begin: %m"); - abort(); - } /* * create attached thread to process RPCs @@ -323,7 +323,7 @@ int main(int argc, char *argv[]) if (select_g_state_save(slurmctld_conf.state_save_location) != SLURM_SUCCESS ) error("failed to restore node selection state"); - switch_state_fini(); + switch_save(slurmctld_conf.state_save_location); if (slurmctld_config.resume_backup == false) break; } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index bd938d1544497c34f4d1e647134d827228445b51..3d74fc4f3edccfe7453e40d9c99e4e641fee98b1 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -894,7 +894,8 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer) step_node_list = NULL; /* re-used, nothing left to free */ step_ptr->time_last_active = time(NULL); step_ptr->switch_job = switch_tmp; - step_ptr->check_job = check_tmp; + step_ptr->check_job = check_tmp; + switch_g_job_step_allocated(switch_tmp, 
step_ptr->step_node_list); info("recovered job step %u.%u", job_ptr->job_id, step_id); return SLURM_SUCCESS; diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 451ec71baea5616cdc908bafa051ce710b66b7f7..5fb3e1038e4b2e16f3ead7155b5df9c01c33938f 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -1168,28 +1168,3 @@ static void _validate_node_proc_count(void) list_iterator_destroy(part_iterator); } #endif - -/* - * switch_state_begin - Recover or initialize switch state - * IN recover - If set, recover switch state as previously saved - * RET 0 if no error, otherwise an error code - * Don't need slurmctld_conf lock as nothing else is running to update value - */ -int switch_state_begin(int recover) -{ - if (recover) - return switch_restore(slurmctld_conf.state_save_location); - else - return switch_restore(NULL); -} - -/* - * switch_state_fini - save switch state and shutdown switch - * RET 0 if no error, otherwise an error code - * Don't need slurmctld_conf lock as nothing else is running to update value - */ -int switch_state_fini(void) -{ - return switch_save(slurmctld_conf.state_save_location); -} - diff --git a/src/slurmctld/read_config.h b/src/slurmctld/read_config.h index 32aa52ab0ccf32737a6cc1f829ce9058c2e5191b..f28671ee6128869be15f14adc5951561e4e33bd4 100644 --- a/src/slurmctld/read_config.h +++ b/src/slurmctld/read_config.h @@ -42,17 +42,4 @@ */ extern int read_slurm_conf(int recover); -/* - * switch_state_begin - Recover or initialize switch state - * IN recover - If set, recover switch state as previously saved - * RET 0 if no error, otherwise an error code - */ -extern int switch_state_begin(int recover); - -/* - * switch_state_fini - save switch state and shutdown switch - * RET 0 if no error, otherwise an error code - */ -extern int switch_state_fini(void); - #endif /* !_HAVE_READ_CONFIG_H */ diff --git a/src/slurmd/slurmd_job.c b/src/slurmd/slurmd_job.c index 
ab58a776d8754e64ea5a4baf381af7bd7e39fe29..c9b587af618bfb44174d6443f255bad3689fe416 100644 --- a/src/slurmd/slurmd_job.c +++ b/src/slurmd/slurmd_job.c @@ -178,6 +178,7 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) job->envtp->stepid = -1; job->envtp->gmpi = -1; job->envtp->procid = -1; + job->envtp->localid = -1; job->envtp->nodeid = -1; memcpy(&resp_addr, cli_addr, sizeof(slurm_addr)); @@ -250,6 +251,7 @@ job_spawn_create(spawn_task_request_msg_t *msg, slurm_addr *cli_addr) job->envtp->stepid = -1; job->envtp->gmpi = -1; job->envtp->procid = -1; + job->envtp->localid = -1; job->envtp->nodeid = -1; memcpy(&io_addr, cli_addr, sizeof(slurm_addr)); @@ -332,6 +334,7 @@ job_batch_job_create(batch_job_launch_msg_t *msg) job->envtp->stepid = -1; job->envtp->gmpi = -1; job->envtp->procid = -1; + job->envtp->localid = -1; job->envtp->nodeid = -1; srun = srun_info_create(NULL, NULL, NULL); diff --git a/src/slurmd/task.c b/src/slurmd/task.c index fd4058d324fe99a10f014368134e1f5006125d3c..d946ca9f4da7174651387b689546891075b49a38 100644 --- a/src/slurmd/task.c +++ b/src/slurmd/task.c @@ -170,6 +170,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd) t = job->task[i]; job->envtp->procid = t->gtid; + job->envtp->localid = t->id; job->envtp->gmpi = getenvp(job->env, "SLURM_GMPI") ? t->gtid : -1; diff --git a/src/srun/srun.c b/src/srun/srun.c index 4d6ea5253dcfd3a5fe7b02503beec7588bed350a..c93f63fc81fec89c5aaf698edd3f4c868e7cd17e 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -126,6 +126,7 @@ int srun(int ac, char **av) env->stepid = -1; env->gmpi = -1; env->procid = -1; + env->localid = -1; env->nodeid = -1; env->cli = NULL; env->env = NULL;