From ccacd390233e075dd6953869ff25366b4198a2fe Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Thu, 15 Oct 2009 17:40:33 +0000
Subject: [PATCH] svn merge -r18877:18906
 https://eris.llnl.gov/svn/slurm/branches/slurm-2.0

---
 NEWS                                          |  2 ++
 .../bluegene/block_allocator/bridge_linker.c  | 18 +++++++++++++++---
 .../select/bluegene/plugin/bg_block_info.c    | 19 +++++++++++++++++++
 src/plugins/select/bluegene/plugin/bluegene.c | 17 ++++++++++-------
 4 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/NEWS b/NEWS
index c9a7f83f929..d2b86f9ad95 100644
--- a/NEWS
+++ b/NEWS
@@ -273,6 +273,8 @@ documents those changes that are of interest to users and admins.
  -- Add range check for SuspendTime configuration parameter.
  -- Moved unzipped python-hostname tarball out and the tarball in.
  -- BLUEGENE - Patched memory leak when running state test.
+ -- BLUEGENE - Fixed slowdown caused by the polling thread contending with
+    the long-running rm_get_BG call.
 
 * Changes in SLURM 2.0.6
 ========================
diff --git a/src/plugins/select/bluegene/block_allocator/bridge_linker.c b/src/plugins/select/bluegene/block_allocator/bridge_linker.c
index f16d3ca052a..5b386bc44a4 100644
--- a/src/plugins/select/bluegene/block_allocator/bridge_linker.c
+++ b/src/plugins/select/bluegene/block_allocator/bridge_linker.c
@@ -251,8 +251,9 @@ extern int bridge_fini()
 {
 	if(handle)
 		dlclose(handle);
-
-	return SLURM_ERROR;
+	initialized = false;
+	
+	return SLURM_SUCCESS;
 }
 
 extern status_t bridge_get_bg(my_bluegene_t **bg)
@@ -300,8 +301,19 @@ extern status_t bridge_get_block_info(pm_partition_id_t pid,
 	int rc = CONNECTION_ERROR;
 	if(!bridge_init())
 		return rc;
+
+	/* Use trylock rather than a blocking lock so that polling does
+	   not lock things up during the long-running get_BG call. */
+	rc = pthread_mutex_trylock(&api_file_mutex);
+	if (rc == EBUSY) 
+		return rc;
+	else if(rc) {
+		errno = rc;
+		error("%s:%d %s: pthread_mutex_trylock(): %m",
+		      __FILE__, __LINE__, __CURRENT_FUNC__);     
+	}
 	
-	slurm_mutex_lock(&api_file_mutex);
+	//slurm_mutex_lock(&api_file_mutex);
 	rc = (*(bridge_api.get_partition_info))(pid, partition);
 	slurm_mutex_unlock(&api_file_mutex);
 	return rc;
diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c
index 76e8809f1e5..d6c4f4e75fb 100644
--- a/src/plugins/select/bluegene/plugin/bg_block_info.c
+++ b/src/plugins/select/bluegene/plugin/bg_block_info.c
@@ -283,6 +283,16 @@ extern int update_block_list()
 					break;
 				}
 			}
+
+			/* If the call was busy, just skip this
+			   iteration.  It usually means something like
+			   rm_get_BG was called which can be a very
+			   long call */
+			if(rc == EBUSY) {
+				debug5("lock was busy, aborting");
+				break;
+			}
+
 			error("bridge_get_block_info(%s): %s", 
 			      name, 
 			      bg_err_str(rc));
@@ -621,6 +631,15 @@ extern int update_freeing_block_list()
 					break;
 				}
 			}
+			/* If the call was busy, just skip this
+			   iteration.  It usually means something like
+			   rm_get_BG was called which can be a very
+			   long call */
+			if(rc == EBUSY) {
+				debug5("lock was busy, aborting");
+				break;
+			}
+
 			error("bridge_get_block_info(%s): %s", 
 			      name, 
 			      bg_err_str(rc));
diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c
index 78b8cccadaa..0e766499046 100644
--- a/src/plugins/select/bluegene/plugin/bluegene.c
+++ b/src/plugins/select/bluegene/plugin/bluegene.c
@@ -41,9 +41,10 @@
 #include "defined_block.h"
 #include <stdio.h>
 
-#define MMCS_POLL_TIME 30	/* poll MMCS for down switches and nodes 
-				 * every 120 secs */
-#define BG_POLL_TIME 0	        /* poll bg blocks every 3 secs */
+#define MMCS_POLL_TIME 30	/* seconds between poll of MMCS for
+				 * down switches and nodes */
+#define BG_POLL_TIME 1	        /* seconds between poll of state
+				 * change in bg blocks */
 
 #define _DEBUG 0
 
@@ -313,7 +314,6 @@ extern void *block_agent(void *args)
 
 	last_bg_test = now - BG_POLL_TIME;
 	while (!agent_fini) {
-
 		if (difftime(now, last_bg_test) >= BG_POLL_TIME) {
 			if (agent_fini)		/* don't bother */
 				break;	/* quit now */
@@ -332,10 +332,10 @@ extern void *block_agent(void *args)
 						      "update_block_list 2");
 				}
 			}
-			now = time(NULL);
 		}
 		
 		sleep(1);
+		now = time(NULL);
 	}
 	return NULL;
 }
@@ -356,9 +356,12 @@ extern void *state_agent(void *args)
 			if (agent_fini)		/* don't bother */
 				break; 	/* quit now */
 			if(blocks_are_created) {
-				last_mmcs_test = now;
-				/* can run for a while */
+				/* can run for a while so set the
+				 * time after the call so there is
+				 * always MMCS_POLL_TIME between
+				 * calls */
 				test_mmcs_failures();
+				last_mmcs_test = time(NULL);
 			}
 		} 	
 				
-- 
GitLab