diff --git a/src/common/switch.c b/src/common/switch.c index 64dafde340853ee54f2437313963d6624d9c70e8..7b5c46234514792a883fe13cf632863bfebec4a1 100644 --- a/src/common/switch.c +++ b/src/common/switch.c @@ -97,6 +97,9 @@ typedef struct slurm_switch_ops { int (*step_allocated) ( switch_jobinfo_t jobinfo, char *nodelist ); int (*state_clear) ( void ); + int (*slurmctld_init) ( void ); + int (*slurmd_init) ( void ); + int (*slurmd_step_init) ( void ); } slurm_switch_ops_t; struct slurm_switch_context { @@ -200,7 +203,10 @@ _slurm_switch_get_ops( slurm_switch_context_t c ) "switch_p_sprintf_node_info", "switch_p_job_step_complete", "switch_p_job_step_allocated", - "switch_p_libstate_clear" + "switch_p_libstate_clear", + "switch_p_slurmctld_init", + "switch_p_slurmd_init", + "switch_p_slurmd_step_init" }; int n_syms = sizeof( syms ) / sizeof( char * ); @@ -551,3 +557,27 @@ extern int switch_g_job_step_allocated(switch_jobinfo_t jobinfo, return (*(g_context->ops.step_allocated))( jobinfo, nodelist ); } + +extern int switch_g_slurmctld_init(void) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.slurmctld_init)) (); +} + +extern int switch_g_slurmd_init(void) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.slurmd_init)) (); +} + +extern int switch_g_slurmd_step_init(void) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.slurmd_step_init)) (); +} diff --git a/src/plugins/switch/elan/switch_elan.c b/src/plugins/switch/elan/switch_elan.c index 68fc06e318b27c72cb2842028699d55e63ed1414..0d8c974fbf15be407ec379d2ff3bbcb3d170c9e0 100644 --- a/src/plugins/switch/elan/switch_elan.c +++ b/src/plugins/switch/elan/switch_elan.c @@ -806,3 +806,18 @@ extern int switch_p_job_step_allocated(switch_jobinfo_t jobinfo, char *nodelist) { return qsw_restore_jobinfo((qsw_jobinfo_t) jobinfo); } + +extern int switch_p_slurmctld_init( void ) +{ + return SLURM_SUCCESS; +} + +extern int 
switch_p_slurmd_init( void ) +{ + return SLURM_SUCCESS; +} + +extern int switch_p_slurmd_step_init( void ) +{ + return SLURM_SUCCESS; +} diff --git a/src/plugins/switch/federation/federation.c b/src/plugins/switch/federation/federation.c index 3e4e831516f210b49b0a019741e51f0071d17175..c16d3e7dadaac85e138e536aa73d2d740269b014 100644 --- a/src/plugins/switch/federation/federation.c +++ b/src/plugins/switch/federation/federation.c @@ -68,6 +68,7 @@ char* fed_conf = NULL; extern bool fed_need_state_save; mode_t fed_umask; + /* * Data structures specific to Federation * @@ -143,6 +144,9 @@ typedef struct { */ fed_libstate_t *fed_state = NULL; pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER; + +/* slurmd/slurmd_step global variables */ +hostlist_t adapter_list; static fed_cache_entry_t lid_cache[FED_MAXADAPTERS]; @@ -171,6 +175,8 @@ static void _strip_cr_nl(char *line); static void _strip_comments(char *line); static int _set_up_adapter(fed_adapter_t *fed_adapter, char *adapter_name); static int _parse_fed_file(hostlist_t *adapter_list); +static void _init_adapter_cache(void); +static int _fill_in_adapter_cache(void); /* The _lock() and _unlock() functions are used to lock/unlock a * global mutex. 
Used to serialize access to the global library @@ -196,6 +202,51 @@ _unlock(void) } } +int +fed_slurmctld_init(void) +{ + return SLURM_SUCCESS; +} + +int +fed_slurmd_init(void) +{ + /* + * This is a work-around for the ntbl_* functions calling umask(0) + */ + fed_umask = umask(0077); + umask(fed_umask); + + /*_init_adapter_cache();*/ + + adapter_list = hostlist_create(NULL); + if (_parse_fed_file(&adapter_list) != SLURM_SUCCESS) + return SLURM_FAILURE; + assert(hostlist_count(adapter_list) <= FED_MAXADAPTERS); + return SLURM_SUCCESS; +} + +int +fed_slurmd_step_init(void) +{ + /* + * This is a work-around for the ntbl_* functions calling umask(0) + */ + fed_umask = umask(0077); + umask(fed_umask); + + _init_adapter_cache(); + + adapter_list = hostlist_create(NULL); + if (_parse_fed_file(&adapter_list) != SLURM_SUCCESS) + return SLURM_FAILURE; + assert(hostlist_count(adapter_list) <= FED_MAXADAPTERS); + + _fill_in_adapter_cache(); + + return SLURM_SUCCESS; +} + static char * _lookup_fed_status_tab(int status) { @@ -258,13 +309,13 @@ char *fed_sprint_jobinfo(fed_jobinfo_t *j, char *buf, /* The lid caching functions were created to avoid unnecessary * function calls each time we need to load network tables on a node. - * fed_init_cache() simply initializes the cache to sane values and + * _init_adapter_cache() simply initializes the cache to sane values and * needs to be called before any other cache functions are called. * - * Used by: slurmd + * Used by: slurmd/slurmd_step */ -void -fed_init_cache(void) +static void +_init_adapter_cache(void) { int i; @@ -273,14 +324,44 @@ fed_init_cache(void) lid_cache[i].lid = -1; lid_cache[i].network_id = -1; } +} - /* - * This is a work-around for the ntbl_* functions calling umask(0) - */ - fed_umask = umask(0077); +/* Use ntbl_adapter_resources to cache information about local adapters. 
+ * + * Used by: slurmd_step + */ +static int +_fill_in_adapter_cache(void) +{ + hostlist_iterator_t adapters; + char *adapter_name = NULL; + ADAPTER_RESOURCES res; + int num; + int rc; + int i; + + adapters = hostlist_iterator_create(adapter_list); + for (i = 0; adapter_name = hostlist_next(adapters); i++) { + rc = ntbl_adapter_resources(NTBL_VERSION, adapter_name, &res); + if (rc != NTBL_SUCCESS) + return SLURM_ERROR; + + num = adapter_name[3] - (int)'0'; + assert(num < FED_MAXADAPTERS); + lid_cache[num].lid = res.lid; + lid_cache[num].network_id = res.network_id; + strncpy(lid_cache[num].name, adapter_name, FED_ADAPTERNAME_LEN); + + free(res.window_list); + free(adapter_name); + } + hostlist_iterator_destroy(adapters); umask(fed_umask); + + return SLURM_SUCCESS; } + /* Cache the lid and network_id of a given adapter. Ex: sni0 with lid 10 * gets cached in array index 0 with a lid = 10 and a name = sni0. * @@ -508,43 +589,29 @@ static int _parse_fed_file(hostlist_t *adapter_list) * For all that exist, record vital adapter info plus status for all windows * available on that adapter. Cache lid to adapter name mapping locally. * - * Is not thread-safe. 
- * * Used by: slurmd */ static int _get_adapters(fed_adapter_t *list, int *count) { - static hostlist_t adapter_list = NULL; - static hostlist_iterator_t adapter_iter; - int i = 0; + hostlist_iterator_t adapter_iter; char *adapter = NULL; + int i; assert(list != NULL); + assert(adapter_list != NULL); - if (adapter_list == NULL || hostlist_is_empty(adapter_list)) { - int rc; - adapter_list = hostlist_create(NULL); - rc = _parse_fed_file(&adapter_list); - if (rc != SLURM_SUCCESS) - return rc; - assert(hostlist_count(adapter_list) <= FED_MAXADAPTERS); - adapter_iter = hostlist_iterator_create(adapter_list); - } - i=0; - *count = hostlist_count(adapter_list); - info("Number of adapters is = %d", *count); - assert(*count > 0); - //list = xmalloc(sizeof(fed_adapter_t) * (*count)); - while (adapter = hostlist_next(adapter_iter)) { - if(_set_up_adapter(list + i, adapter) - == SLURM_ERROR) + adapter_iter = hostlist_iterator_create(adapter_list); + for (i = 0; adapter = hostlist_next(adapter_iter); i++) { + if(_set_up_adapter(list + i, adapter) == SLURM_ERROR) fatal("Failed to set up adapter %s.", adapter); free(adapter); - i++; } - hostlist_iterator_reset(adapter_iter); + hostlist_iterator_destroy(adapter_iter); + assert(i > 0); + *count = i; + info("Number of adapters is = %d", *count); if(!*count) slurm_seterrno_ret(ENOADAPTER); diff --git a/src/plugins/switch/federation/federation.h b/src/plugins/switch/federation/federation.h index 6c8328b2667eac52bfbdebca13af04778c1fb4f4..9370cabef30995b832bd4137db23447f5f70c077 100644 --- a/src/plugins/switch/federation/federation.h +++ b/src/plugins/switch/federation/federation.h @@ -81,7 +81,6 @@ fed_jobinfo_t *fed_copy_jobinfo(fed_jobinfo_t *jp); void fed_free_jobinfo(fed_jobinfo_t *jp); int fed_load_table(fed_jobinfo_t *jp, int uid, int pid); int fed_init(void); -void fed_init_cache(void); int fed_unload_table(fed_jobinfo_t *jp); int fed_unpack_libstate(fed_libstate_t *lp, Buf buffer); int fed_get_jobinfo(fed_jobinfo_t *jp, int 
key, void *data); diff --git a/src/plugins/switch/federation/switch_federation.c b/src/plugins/switch/federation/switch_federation.c index 5c8652ab6050c7a213726e22e8ee7ba54c35c68d..3c5798f49262068176e412138c69016dfd757385 100644 --- a/src/plugins/switch/federation/switch_federation.c +++ b/src/plugins/switch/federation/switch_federation.c @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/***************************************************************************** \ ** switch_federation.c - Library routines for initiating jobs on IBM ** Federation ** $Id$ @@ -120,7 +120,6 @@ const uint32_t plugin_version = 100; int init ( void ) { verbose("%s loaded", plugin_name); - fed_init_cache(); return SLURM_SUCCESS; } @@ -130,6 +129,21 @@ int fini ( void ) return SLURM_SUCCESS; } +int switch_p_slurmctld_init( void ) +{ + return fed_slurmctld_init(); +} + +int switch_p_slurmd_init( void ) +{ + return fed_slurmd_init(); +} + +int switch_p_slurmd_step_init( void ) +{ + return fed_slurmd_step_init(); +} + /* * switch functions for global state save/restore */ diff --git a/src/plugins/switch/none/switch_none.c b/src/plugins/switch/none/switch_none.c index cdd43fa146ce20b03353ed99a4b10b7736091659..163d3ae583898b89e2c3c22c97eda950fdc2c1d9 100644 --- a/src/plugins/switch/none/switch_none.c +++ b/src/plugins/switch/none/switch_none.c @@ -284,3 +284,17 @@ extern int switch_p_job_step_allocated(switch_jobinfo_t jobinfo, return SLURM_SUCCESS; } +extern int switch_p_slurmctld_init( void ) +{ + return SLURM_SUCCESS; +} + +extern int switch_p_slurmd_init( void ) +{ + return SLURM_SUCCESS; +} + +extern int switch_p_slurmd_step_init( void ) +{ + return SLURM_SUCCESS; +} diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c index 0ca1f4921a5841a8d03cb40712b54c103c5cff6d..46ab3f4274e735209e4cbdf15b0ba171c9e9feab 100644 --- a/src/slurmd/slurmd.c +++ b/src/slurmd/slurmd.c @@ -195,6 +195,7 @@ main (int argc, char *argv[]) fatal("Unable to 
initialize interconnect."); if (conf->cleanstart && switch_g_clear_node_state()) fatal("Unable to clear interconnect state."); + switch_g_slurmd_init(); _create_msg_socket(); diff --git a/src/slurmd/slurmd_step.c b/src/slurmd/slurmd_step.c index f108b20267ffb057c65e58a87df422dbe2c2c851..b979e03ff866164dee1bfaad606f38bec16b6de1 100644 --- a/src/slurmd/slurmd_step.c +++ b/src/slurmd/slurmd_step.c @@ -93,6 +93,7 @@ main (int argc, char *argv[]) //log_alter(conf->log_opts, 0, NULL); log_init(argv[0],conf->log_opts, LOG_DAEMON, conf->logfile); g_slurmd_jobacct_init(conf->cf.job_acct_parameters); + switch_g_slurmd_step_init(); /* receive len of packed cli from main slurmd */ if((rc = read(STDIN_FILENO, &len, sizeof(int))) == -1) {