Skip to content
Snippets Groups Projects
Commit 185705cb authored by Danny Auble's avatar Danny Auble
Browse files

BGQ - if we are shutting down or the realtime server is waiting to start

stop the current poll asap.
parent 075562ca
No related branches found
No related tags found
No related merge requests found
...@@ -757,6 +757,8 @@ static void _do_block_poll(void) ...@@ -757,6 +757,8 @@ static void _do_block_poll(void)
block_ptr->getStatus().toValue()), block_ptr->getStatus().toValue()),
kill_job_list)) kill_job_list))
updated = 1; updated = 1;
if (rt_waiting || slurmctld_config.shutdown_time)
break;
} }
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
unlock_slurmctld(job_read_lock); unlock_slurmctld(job_read_lock);
...@@ -765,7 +767,6 @@ static void _do_block_poll(void) ...@@ -765,7 +767,6 @@ static void _do_block_poll(void)
if (updated == 1) if (updated == 1)
last_bg_update = time(NULL); last_bg_update = time(NULL);
} }
/* Even though ba_mp should be coming from the main list /* Even though ba_mp should be coming from the main list
...@@ -818,6 +819,9 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq, ...@@ -818,6 +819,9 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq,
slurm_mutex_unlock(&ba_system_mutex); slurm_mutex_unlock(&ba_system_mutex);
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
unlock_slurmctld(job_read_lock); unlock_slurmctld(job_read_lock);
if (rt_waiting
|| slurmctld_config.shutdown_time)
return;
} }
} }
} }
...@@ -836,18 +840,23 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq, ...@@ -836,18 +840,23 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq,
_handle_bad_nodeboard( _handle_bad_nodeboard(
nb_ptr->getLocation().substr(7,3).c_str(), nb_ptr->getLocation().substr(7,3).c_str(),
bg_down_node, nb_ptr->getState(), NULL, 0); bg_down_node, nb_ptr->getState(), NULL, 0);
if (rt_waiting || slurmctld_config.shutdown_time)
return;
} }
} }
for (dim=Dimension::A; dim<=Dimension::D; dim++) { for (dim=Dimension::A; dim<=Dimension::D; dim++) {
Switch::ConstPtr switch_ptr = bridge_get_switch(mp_ptr, dim); Switch::ConstPtr switch_ptr = bridge_get_switch(mp_ptr, dim);
if (switch_ptr) { if (switch_ptr) {
if (switch_ptr->getState() != Hardware::Available) if (switch_ptr->getState() != Hardware::Available) {
_handle_bad_switch(dim, _handle_bad_switch(dim,
bg_down_node, bg_down_node,
switch_ptr->getState(), switch_ptr->getState(),
1, 0); 1, 0);
else { if (rt_waiting
|| slurmctld_config.shutdown_time)
return;
} else {
Cable::ConstPtr my_cable = Cable::ConstPtr my_cable =
switch_ptr->getCable(); switch_ptr->getCable();
/* Dimensions of length 1 do not have a /* Dimensions of length 1 do not have a
...@@ -866,6 +875,9 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq, ...@@ -866,6 +875,9 @@ static void _handle_midplane_update(ComputeHardware::ConstPtr bgq,
delete_list, 0); delete_list, 0);
slurm_mutex_unlock(&ba_system_mutex); slurm_mutex_unlock(&ba_system_mutex);
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
if (rt_waiting
|| slurmctld_config.shutdown_time)
return;
} }
} }
} }
...@@ -892,6 +904,8 @@ static void _do_hardware_poll(int level, uint16_t *coords, ...@@ -892,6 +904,8 @@ static void _do_hardware_poll(int level, uint16_t *coords,
coords[level]++) { coords[level]++) {
/* handle the outter dims here */ /* handle the outter dims here */
_do_hardware_poll(level+1, coords, bgqsys); _do_hardware_poll(level+1, coords, bgqsys);
if (rt_waiting || slurmctld_config.shutdown_time)
return;
} }
return; return;
} }
...@@ -929,11 +943,10 @@ static void *_poll(void *no_data) ...@@ -929,11 +943,10 @@ static void *_poll(void *no_data)
} }
//debug("polling taking over, realtime is dead"); //debug("polling taking over, realtime is dead");
curr_time = time(NULL); curr_time = time(NULL);
if (blocks_are_created) if (!rt_waiting && blocks_are_created)
_do_block_poll(); _do_block_poll();
/* only do every 30 seconds */ /* only do every 30 seconds */
if ((curr_time - 30) >= last_ran) { if (!rt_waiting && ((curr_time - 30) >= last_ran)) {
uint16_t coords[SYSTEM_DIMENSIONS]; uint16_t coords[SYSTEM_DIMENSIONS];
_do_hardware_poll(0, coords, _do_hardware_poll(0, coords,
bridge_get_compute_hardware()); bridge_get_compute_hardware());
...@@ -1367,6 +1380,8 @@ extern int bridge_status_fini(void) ...@@ -1367,6 +1380,8 @@ extern int bridge_status_fini(void)
return SLURM_ERROR; return SLURM_ERROR;
bridge_status_inited = false; bridge_status_inited = false;
rt_waiting = 1;
#if defined HAVE_BG_FILES #if defined HAVE_BG_FILES
/* make the rt connection end. */ /* make the rt connection end. */
_bridge_status_disconnect(); _bridge_status_disconnect();
...@@ -1405,7 +1420,9 @@ extern int bridge_status_update_block_list_state(List block_list) ...@@ -1405,7 +1420,9 @@ extern int bridge_status_update_block_list_state(List block_list)
while ((bg_record = (bg_record_t *) list_next(itr))) { while ((bg_record = (bg_record_t *) list_next(itr))) {
BlockFilter filter; BlockFilter filter;
Block::Ptrs vec; Block::Ptrs vec;
if (bg_record->magic != BLOCK_MAGIC) { if (!bridge_status_inited)
break;
else if (bg_record->magic != BLOCK_MAGIC) {
/* block is gone */ /* block is gone */
list_remove(itr); list_remove(itr);
continue; continue;
......
...@@ -58,6 +58,7 @@ ...@@ -58,6 +58,7 @@
* overwritten when linking with the slurmctld. * overwritten when linking with the slurmctld.
*/ */
#if defined (__APPLE__) #if defined (__APPLE__)
slurmctld_config_t slurmctld_config __attribute__((weak_import));
slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import)); slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
struct node_record *node_record_table_ptr __attribute__((weak_import)) = NULL; struct node_record *node_record_table_ptr __attribute__((weak_import)) = NULL;
int bg_recover __attribute__((weak_import)) = NOT_FROM_CONTROLLER; int bg_recover __attribute__((weak_import)) = NOT_FROM_CONTROLLER;
...@@ -71,6 +72,7 @@ void *acct_db_conn __attribute__((weak_import)) = NULL; ...@@ -71,6 +72,7 @@ void *acct_db_conn __attribute__((weak_import)) = NULL;
char *slurmctld_cluster_name __attribute__((weak_import)) = NULL; char *slurmctld_cluster_name __attribute__((weak_import)) = NULL;
slurmdb_cluster_rec_t *working_cluster_rec __attribute__((weak_import)) = NULL; slurmdb_cluster_rec_t *working_cluster_rec __attribute__((weak_import)) = NULL;
#else #else
slurmctld_config_t slurmctld_config;
slurm_ctl_conf_t slurmctld_conf; slurm_ctl_conf_t slurmctld_conf;
struct node_record *node_record_table_ptr = NULL; struct node_record *node_record_table_ptr = NULL;
int bg_recover = NOT_FROM_CONTROLLER; int bg_recover = NOT_FROM_CONTROLLER;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment