From ccacd390233e075dd6953869ff25366b4198a2fe Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Thu, 15 Oct 2009 17:40:33 +0000 Subject: [PATCH] svn merge -r18877:18906 https://eris.llnl.gov/svn/slurm/branches/slurm-2.0 --- NEWS | 2 ++ .../bluegene/block_allocator/bridge_linker.c | 18 +++++++++++++++--- .../select/bluegene/plugin/bg_block_info.c | 19 +++++++++++++++++++ src/plugins/select/bluegene/plugin/bluegene.c | 17 ++++++++++------- 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index c9a7f83f929..d2b86f9ad95 100644 --- a/NEWS +++ b/NEWS @@ -273,6 +273,8 @@ documents those changes that are of interest to users and admins. -- Add range check for SuspendTime configuration parameter. -- Moved unzipped python-hostname tarball out and the tarball in. -- BLUEGENE - Patched memory leak when running state test. + -- BLUEGENE - fixed slow down generated by slow call rm_get_BG + and polling thread. * Changes in SLURM 2.0.6 ======================== diff --git a/src/plugins/select/bluegene/block_allocator/bridge_linker.c b/src/plugins/select/bluegene/block_allocator/bridge_linker.c index f16d3ca052a..5b386bc44a4 100644 --- a/src/plugins/select/bluegene/block_allocator/bridge_linker.c +++ b/src/plugins/select/bluegene/block_allocator/bridge_linker.c @@ -251,8 +251,9 @@ extern int bridge_fini() { if(handle) dlclose(handle); - - return SLURM_ERROR; + initialized = false; + + return SLURM_SUCCESS; } extern status_t bridge_get_bg(my_bluegene_t **bg) @@ -300,8 +301,19 @@ extern status_t bridge_get_block_info(pm_partition_id_t pid, int rc = CONNECTION_ERROR; if(!bridge_init()) return rc; + + /* this is here to make sure we don't lock up things with + polling and the long running get_BG call */ + rc = pthread_mutex_trylock(&api_file_mutex); + if (rc == EBUSY) + return rc; + else if(rc) { + errno = rc; + error("%s:%d %s: pthread_mutex_trylock(): %m", + __FILE__, __LINE__, __CURRENT_FUNC__); + } - slurm_mutex_lock(&api_file_mutex); + //slurm_mutex_lock(&api_file_mutex); rc = (*(bridge_api.get_partition_info))(pid, partition); slurm_mutex_unlock(&api_file_mutex); return rc; diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c index 76e8809f1e5..d6c4f4e75fb 100644 --- a/src/plugins/select/bluegene/plugin/bg_block_info.c +++ b/src/plugins/select/bluegene/plugin/bg_block_info.c @@ -283,6 +283,16 @@ extern int update_block_list() break; } } + + /* If the call was busy, just skip this + iteration. It usually means something like + rm_get_BG was called which can be a very + long call */ + if(rc == EBUSY) { + debug5("lock was busy, aborting"); + break; + } + error("bridge_get_block_info(%s): %s", name, bg_err_str(rc)); @@ -621,6 +631,15 @@ extern int update_freeing_block_list() break; } } + /* If the call was busy, just skip this + iteration. It usually means something like + rm_get_BG was called which can be a very + long call */ + if(rc == EBUSY) { + debug5("lock was busy, aborting"); + break; + } + error("bridge_get_block_info(%s): %s", name, bg_err_str(rc)); diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 78b8cccadaa..0e766499046 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -41,9 +41,10 @@ #include "defined_block.h" #include <stdio.h> -#define MMCS_POLL_TIME 30 /* poll MMCS for down switches and nodes - * every 120 secs */ -#define BG_POLL_TIME 0 /* poll bg blocks every 3 secs */ +#define MMCS_POLL_TIME 30 /* seconds between poll of MMCS for + * down switches and nodes */ +#define BG_POLL_TIME 1 /* seconds between poll of state + * change in bg blocks */ #define _DEBUG 0 @@ -313,7 +314,6 @@ extern void *block_agent(void *args) last_bg_test = now - BG_POLL_TIME; while (!agent_fini) { - if (difftime(now, last_bg_test) >= BG_POLL_TIME) { if (agent_fini) /* don't bother */ break; /* quit now */ @@ -332,10 +332,10 @@ extern void *block_agent(void *args) "update_block_list 2"); } } - now = time(NULL); } sleep(1); + now = time(NULL); } return NULL; } @@ -356,9 +356,12 @@ extern void *state_agent(void *args) if (agent_fini) /* don't bother */ break; /* quit now */ if(blocks_are_created) { - last_mmcs_test = now; - /* can run for a while */ + /* can run for a while so set the + * time after the call so there is + * always MMCS_POLL_TIME between + * calls */ test_mmcs_failures(); + last_mmcs_test = time(NULL); } } -- GitLab