From 64542be0340cf2ba813f96debb80e340f9b0a788 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 25 Oct 2006 23:00:04 +0000
Subject: [PATCH] svn merge -r9908:9923
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

---
 NEWS                                          |   6 +
 doc/man/man5/wiki.conf.5                      |   6 +-
 .../slurm_protocol_socket_implementation.c    |  23 +++-
 src/plugins/sched/wiki2/event.c               |   8 +-
 src/plugins/sched/wiki2/msg.c                 |   4 +
 .../select/bluegene/plugin/bg_block_info.c    |  47 +++++++-
 .../select/bluegene/plugin/bg_job_run.c       |   7 +-
 .../bluegene/plugin/bg_switch_connections.c   |  37 ++++--
 src/plugins/select/bluegene/plugin/bluegene.c | 106 ++++++++++++++++--
 src/plugins/select/bluegene/plugin/bluegene.h |   2 +-
 src/slurmctld/controller.c                    |   8 +-
 testsuite/expect/test9.7.bash                 |  19 ++--
 12 files changed, 227 insertions(+), 46 deletions(-)

diff --git a/NEWS b/NEWS
index c63efe418a8..9b386ff3ba5 100644
--- a/NEWS
+++ b/NEWS
@@ -89,6 +89,12 @@ documents those changes that are of interest to users and admins.
  - In sched/wiki2, fix memory management bug for JOBWILLRUN command.
  - In sched/wiki2, consider job Busy while in Completing state for 
    KillWait+10 seconds (used to be 30 seconds).
+ - BLUEGENE - Fixes to allow full block creation on the system and not to add
+   passthrough nodes to the allocation when creating a block. 
+ - BLUEGENE - Fix deadlock issue with starting and failing jobs at the same
+   time.
+ - Make connect() non-blocking and poll to avoid possibly very long default 
+   timeout.
 
 * Changes in SLURM 1.1.17
 =========================
diff --git a/doc/man/man5/wiki.conf.5 b/doc/man/man5/wiki.conf.5
index 772ff3c0f46..8c311088f38 100644
--- a/doc/man/man5/wiki.conf.5
+++ b/doc/man/man5/wiki.conf.5
@@ -25,15 +25,15 @@ This numeric value should match KEY configured in the
 \fBEHost\fR
 Name the computer on which Moab server executes.
 It is used in establishing a communications path for event notification. 
-By default the \fBEHost\fR will be identical in value to the 
+By default \fBEHost\fR will be identical in value to the 
 \fBControlAddr\fR configured in slurm.conf.
 
 .TP
 \fBEHostBackup\fR
 Name the computer on which the backup Moab server executes.
 It is used in establishing a communications path for event notification.
-There is no default value for \fBEHostBackup\fR (no backup 
-controller is configured).
+By default \fBEHostBackup\fR will be identical in value to the
+\fBBackupAddr\fR configured in slurm.conf.
 
 .TP
 \fBEPort\fR
diff --git a/src/common/slurm_protocol_socket_implementation.c b/src/common/slurm_protocol_socket_implementation.c
index a179ac0733f..c6b0980d0e1 100644
--- a/src/common/slurm_protocol_socket_implementation.c
+++ b/src/common/slurm_protocol_socket_implementation.c
@@ -565,7 +565,46 @@ extern int _slurm_getsockname (int __fd, struct sockaddr * __addr,
 extern int _slurm_connect (int __fd, struct sockaddr const * __addr, 
                                 socklen_t __len)
 {
-        return connect ( __fd , __addr , __len ) ;
+	/* From "man connect": Note that for IP sockets the timeout
+	 * may be very long when syncookies are enabled on the server.
+	 *
+	 * Timeouts in excess of 3 minutes have been observed, resulting
+	 * in serious problems for slurmctld. Making the connect call
+	 * non-blocking and polling avoids the problem.
+	 *
+	 * Returns 0 on success, or -1 with errno set (ETIMEDOUT when the
+	 * connection is not established within the 5 second wait). */
+	int rc = -1, flags, err = 0;
+	socklen_t err_len = sizeof(err);
+	struct pollfd ufds;
+
+	flags = fcntl(__fd, F_GETFL);
+	if (flags == -1)	/* can't change mode: plain connect */
+		return connect ( __fd , __addr , __len ) ;
+	fcntl(__fd, F_SETFL, flags | O_NONBLOCK);
+	rc = connect ( __fd , __addr , __len ) ;
+	if (rc == -1)
+		err = errno;
+	if ((rc == -1)
+	&&  ((err == EINPROGRESS) || (err == EALREADY))) {
+		ufds.fd = __fd;
+		ufds.events = POLLOUT;
+		ufds.revents = 0;
+		/* SO_ERROR reports the outcome portably; a second
+		 * connect() call may fail with EISCONN on success */
+		rc = poll(&ufds, 1, 5000);   /* 5 sec max wait */
+		if (rc == 0)
+			err = ETIMEDOUT;
+		else if ((rc == -1)
+		     ||  getsockopt(__fd, SOL_SOCKET, SO_ERROR,
+				    &err, &err_len))
+			err = errno;
+		rc = err ? -1 : 0;
+	}
+	fcntl(__fd, F_SETFL, flags);
+	if (rc == -1)
+		errno = err;	/* fcntl() may have overwritten it */
+	return rc;
 }
 
 /* Put the address of the peer connected to socket FD into *ADDR
diff --git a/src/plugins/sched/wiki2/event.c b/src/plugins/sched/wiki2/event.c
index 86b9a65bcc9..39d4eb0efc4 100644
--- a/src/plugins/sched/wiki2/event.c
+++ b/src/plugins/sched/wiki2/event.c
@@ -77,8 +77,11 @@ extern int	event_notify(char *msg)
 		}
 	}
 	event_fd = slurm_open_msg_conn(&moab_event_addr);
-	if ((event_fd == -1) && (event_addr_set == 2))
+	if ((event_fd == -1) && (event_addr_set == 2)) {
+		debug("Unable to open wiki event port %s:%u: %m",
+			e_host, e_port);
 		event_fd = slurm_open_msg_conn(&moab_event_addr_bu);
+	}
 	if (event_fd == -1) {
 		char *host_name;
 		if (event_addr_set == 2)
@@ -88,6 +91,9 @@ extern int	event_notify(char *msg)
 		error("Unable to open wiki event port %s:%u: %m", 
 			host_name, e_port);
 		pthread_mutex_unlock(&event_mutex);
+		/* Don't retry again for a while (10 mins)
+		 * to avoid long delays from ETIMEDOUT */
+		last_notify_time = now + 600;
 		return -1;
 	}
 
diff --git a/src/plugins/sched/wiki2/msg.c b/src/plugins/sched/wiki2/msg.c
index 34d1f92dd66..c6a151edb23 100644
--- a/src/plugins/sched/wiki2/msg.c
+++ b/src/plugins/sched/wiki2/msg.c
@@ -211,6 +211,10 @@ static void _parse_wiki_config(void)
 	/* Set default values */
 	conf = slurm_conf_lock();
 	strncpy(e_host, conf->control_addr, sizeof(e_host));
+	if (conf->backup_addr) {
+		strncpy(e_host_bu, conf->backup_addr,
+			sizeof(e_host_bu));	/* bound by the destination */
+	}
 	kill_wait = conf->kill_wait;
 	slurm_conf_unlock();
 
diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c
index 9878b3f0fb8..f2103a91b9a 100644
--- a/src/plugins/select/bluegene/plugin/bg_block_info.c
+++ b/src/plugins/select/bluegene/plugin/bg_block_info.c
@@ -70,7 +70,15 @@
 #define RETRY_BOOT_COUNT 3
 
 #ifdef HAVE_BG_FILES
-static int  _block_is_deallocating(bg_record_t *bg_record);
+
+typedef struct {
+	int jobid;
+} kill_job_struct_t;
+
+List kill_job_list = NULL;
+
+static int _block_is_deallocating(bg_record_t *bg_record);
+static void _destroy_kill_struct(void *object);
 
 static int _block_is_deallocating(bg_record_t *bg_record)
 {
@@ -86,11 +94,17 @@ static int _block_is_deallocating(bg_record_t *bg_record)
 	} 
 	slurm_conf_unlock();
 	
+	if(jobid > -1) {
+		kill_job_struct_t *freeit = xmalloc(sizeof(kill_job_struct_t));
+		freeit->jobid = jobid;
+		list_push(kill_job_list, freeit);
+	}	
 	if(bg_record->target_name 
 	   && bg_record->user_name) {
 		if(!strcmp(bg_record->target_name, user_name)) {
 			if(strcmp(bg_record->target_name, 
-				  bg_record->user_name)) {
+				  bg_record->user_name)
+			   || (jobid > -1)) {
 				error("Block %s was in a ready state "
 				      "for user %s but is being freed. "
 				      "Job %d was lost.",
@@ -98,8 +112,6 @@ static int _block_is_deallocating(bg_record_t *bg_record)
 				      bg_record->user_name,
 				      jobid);
 				
-				if(jobid > -1)
-					slurm_fail_job(jobid);
 				if(remove_from_bg_list(bg_job_block_list, 
 						       bg_record) 
 				   == SLURM_SUCCESS) {
@@ -136,6 +148,15 @@ static int _block_is_deallocating(bg_record_t *bg_record)
 			
 	return SLURM_SUCCESS;
 }
+/* Release one kill_job_list entry (destructor given to list_create) */
+static void _destroy_kill_struct(void *object)
+{
+	kill_job_struct_t *kill_rec = (kill_job_struct_t *) object;
+	if(!kill_rec)
+		return;
+	xfree(kill_rec);
+}
+
 #endif
 
 
@@ -210,8 +231,12 @@ extern int update_block_list()
 	bg_record_t *bg_record = NULL;
 	time_t now;
 	int skipped_dealloc = 0;
+	kill_job_struct_t *freeit = NULL;
 	ListIterator itr = NULL;
 	
+	if(!kill_job_list)
+		kill_job_list = list_create(_destroy_kill_struct);
+
 	if(!bg_list) 
 		return updated;
 	
@@ -363,7 +388,12 @@ extern int update_block_list()
 			case RM_PARTITION_READY:
 				debug("block %s is ready.",
 				      bg_record->bg_block_id);
-				set_block_user(bg_record); 	
+				if(set_block_user(bg_record) == SLURM_ERROR) {
+					freeit = xmalloc(
+						sizeof(kill_job_struct_t));
+					freeit->jobid = bg_record->job_running;
+					list_push(kill_job_list, freeit);
+				}
 				break;
 			default:
 				debug("Hey the state of the "
@@ -381,6 +411,13 @@ extern int update_block_list()
 	}
 	list_iterator_destroy(itr);
 	slurm_mutex_unlock(&block_state_mutex);
+	
+	/* kill all the jobs from unexpectedly freed blocks */
+	while((freeit = list_pop(kill_job_list))) {
+		debug2("killing job %d", freeit->jobid);
+		(void) slurm_fail_job(freeit->jobid);
+		_destroy_kill_struct(freeit);
+	}
 		
 #endif
 	return updated;
diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c
index e65a45cc3d6..107ae095a90 100644
--- a/src/plugins/select/bluegene/plugin/bg_job_run.c
+++ b/src/plugins/select/bluegene/plugin/bg_job_run.c
@@ -253,15 +253,14 @@ static void _start_agent(bg_update_t *bg_update_ptr)
 	bg_record = 
 		find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id);
 
-	slurm_mutex_lock(&block_state_mutex);
 	if(!bg_record) {
-		slurm_mutex_unlock(&block_state_mutex);
 		error("block %s not found in bg_list",
 		      bg_update_ptr->bg_block_id);
 		(void) slurm_fail_job(bg_update_ptr->job_id);
 		slurm_mutex_unlock(&job_start_mutex);
 		return;
 	}
+	slurm_mutex_lock(&block_state_mutex);
 	if(bg_record->job_running <= NO_JOB_RUNNING) {
 		slurm_mutex_unlock(&block_state_mutex);
 		slurm_mutex_unlock(&job_start_mutex);
@@ -545,7 +544,7 @@ static void _term_agent(bg_update_t *bg_update_ptr)
 static void *_block_agent(void *args)
 {
 	bg_update_t *bg_update_ptr = NULL;
-	
+				
 	/*
 	 * Don't just exit when there is no work left. Creating 
 	 * pthreads from within a dynamically linked object (plugin)
@@ -562,7 +561,7 @@ static void *_block_agent(void *args)
 		}
 		if (bg_update_ptr->op == START_OP)
 			_start_agent(bg_update_ptr);
-		else if (bg_update_ptr->op == TERM_OP) 
+		else if (bg_update_ptr->op == TERM_OP)
 			_term_agent(bg_update_ptr);
 		else if (bg_update_ptr->op == SYNC_OP)
 			_sync_agent(bg_update_ptr);
diff --git a/src/plugins/select/bluegene/plugin/bg_switch_connections.c b/src/plugins/select/bluegene/plugin/bg_switch_connections.c
index 2cb58413e5e..641a329b506 100644
--- a/src/plugins/select/bluegene/plugin/bg_switch_connections.c
+++ b/src/plugins/select/bluegene/plugin/bg_switch_connections.c
@@ -228,7 +228,7 @@ static int _lookat_path(bg_bp_t *bg_bp, ba_switch_t *curr_switch,
 		if(port_tar == curr_switch->ext_wire[port_tar].port_tar) {
 			//list_delete(conn_itr);
 			//continue;
-			debug3("I found these %d %d",port_tar, 
+			debug3("I found these %d %d", port_tar, 
 			       curr_switch->ext_wire[port_tar].port_tar);
 		}
 		if(((bg_conn->source == port_tar)
@@ -261,16 +261,16 @@ static int _lookat_path(bg_bp_t *bg_bp, ba_switch_t *curr_switch,
 	/* set source to the node you are on */
 	node_src = curr_switch->ext_wire[0].node_tar;
 
-	debug("dim %d trying from %d%d%d %d -> %d%d%d %d",
-	      dim,
-	      node_src[X], 
-	      node_src[Y], 
-	      node_src[Z],
-	      port_tar1,
-	      node_tar[X], 
-	      node_tar[Y], 
-	      node_tar[Z],
-	      port_tar);
+	debug2("dim %d trying from %d%d%d %d -> %d%d%d %d",
+	       dim,
+	       node_src[X], 
+	       node_src[Y], 
+	       node_src[Z],
+	       port_tar1,
+	       node_tar[X], 
+	       node_tar[Y], 
+	       node_tar[Z],
+	       port_tar);
 
 
 	bg_itr = list_iterator_create(bg_bp_list);
@@ -520,13 +520,26 @@ extern int configure_block_switches(bg_record_t * bg_record)
 	int first_bp=1;
 	int first_switch=1;
 	
+	if(!bg_record->bg_block_list) {
+		error("There was no block_list given, can't create block");
+		return SLURM_ERROR;
+	}
+
 	bg_bp_list = list_create(NULL);
 	bg_record->switch_count = 0;
 	bg_record->bp_count = 0;
 		
 	itr = list_iterator_create(bg_record->bg_block_list);
 	while ((ba_node = (ba_node_t *) list_next(itr)) != NULL) {
-		debug2("node %d%d%d",
+		if(!ba_node->used) {
+			debug3("%d%d%d is a passthrough, "
+			       "not including in request",
+			       ba_node->coord[X], 
+			       ba_node->coord[Y], 
+			       ba_node->coord[Z]);
+			continue;
+		}
+		debug2("using node %d%d%d",
 		       ba_node->coord[X], 
 		       ba_node->coord[Y], 
 		       ba_node->coord[Z]);
diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c
index 2cf46e14b6f..8e7c4deadcf 100644
--- a/src/plugins/select/bluegene/plugin/bluegene.c
+++ b/src/plugins/select/bluegene/plugin/bluegene.c
@@ -100,6 +100,7 @@ static int  _addto_node_list(bg_record_t *bg_record, int *start, int *end);
 static void _set_bg_lists();
 static int  _validate_config_nodes(void);
 static int  _bg_record_cmpf_inc(bg_record_t *rec_a, bg_record_t *rec_b);
+static int  _ba_node_cmpf_inc(ba_node_t *node_a, ba_node_t *node_b);
 static int _delete_old_blocks(void);
 static char *_get_bg_conf(void);
 static int _add_block_db(bg_record_t *bg_record, int *block_inx);
@@ -358,9 +359,16 @@ extern void process_nodes(bg_record_t *bg_record)
 	end[X] = -1;
 	end[Y] = -1;
 	end[Z] = -1;
-	
+
+	list_sort(bg_record->bg_block_list, (ListCmpF) _ba_node_cmpf_inc);
+
 	itr = list_iterator_create(bg_record->bg_block_list);
 	while ((ba_node = list_next(itr)) != NULL) {
+		debug4("%d%d%d is included in this block",
+		       ba_node->coord[X],
+		       ba_node->coord[Y],
+		       ba_node->coord[Z]);
+		       
 		if(ba_node->coord[X]>end[X]) {
 			bg_record->geo[X]++;
 			end[X] = ba_node->coord[X];
@@ -375,10 +383,12 @@ extern void process_nodes(bg_record_t *bg_record)
 		}
 	}
 	list_iterator_destroy(itr);
-	debug3("geo = %d%d%d\n",
+	debug3("geo = %d%d%d bp count is %d\n",
 	       bg_record->geo[X],
 	       bg_record->geo[Y],
-	       bg_record->geo[Z]);
+	       bg_record->geo[Z],
+	       bg_record->bp_count);
+
 	if ((bg_record->geo[X] == DIM_SIZE[X])
 	    && (bg_record->geo[Y] == DIM_SIZE[Y])
 	    && (bg_record->geo[Z] == DIM_SIZE[Z]))
@@ -425,6 +435,7 @@ extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record)
 	sec_record->switch_count = fir_record->switch_count;
 	sec_record->boot_state = fir_record->boot_state;
 	sec_record->boot_count = fir_record->boot_count;
+	sec_record->full_block = fir_record->full_block;
 
 	for(i=0;i<BA_SYSTEM_DIMENSIONS;i++) {
 		sec_record->geo[i] = fir_record->geo[i];
@@ -755,7 +766,11 @@ extern int remove_all_users(char *bg_block_id, char *user_name)
 	return returnc;
 }
 
-extern void set_block_user(bg_record_t *bg_record) 
+/* Returns SLURM_ERROR if the user could not be added to the block; the
+   caller must then fail the job with slurm_fail_job(bg_record->job_running).
+*/
+
+extern int set_block_user(bg_record_t *bg_record) 
 {
 	int rc = 0;
 	debug("resetting the boot state flag and "
@@ -766,16 +781,18 @@ extern void set_block_user(bg_record_t *bg_record)
 	slurm_conf_lock();
 	if((rc = update_block_user(bg_record, 1)) == 1) {
 		last_bg_update = time(NULL);
+		rc = SLURM_SUCCESS;
 	} else if (rc == -1) {
 		error("Unable to add user name to block %s. "
 		      "Cancelling job.",
 		      bg_record->bg_block_id);
-		(void) slurm_fail_job(bg_record->job_running);
+		rc = SLURM_ERROR;
 	}	
 	xfree(bg_record->target_name);
 	bg_record->target_name = 
 		xstrdup(slurmctld_conf.slurm_user_name);
-	slurm_conf_unlock();			
+	slurm_conf_unlock();	
+	return rc;
 }
 
 extern char* convert_lifecycle(lifecycle_type_t lifecycle)
@@ -937,6 +954,7 @@ extern int create_defined_blocks(bg_layout_t overlapped)
 				      "no bg_found_block_list 1");
 			}
 			if(bg_record->bp_count>0 
+			   && !bg_record->full_block
 			   && bg_record->cpus_per_bp == procs_per_node) {
 				char *name = NULL;
 				if(overlapped == LAYOUT_OVERLAP)
@@ -996,6 +1014,22 @@ extern int create_defined_blocks(bg_layout_t overlapped)
 				}
 			}
 			if(found_record == NULL) {
+				if(bg_record->full_block) {
+					/* if this is defined we need
+					   to remove it since we are
+					   going to try to create it
+					   later on overlap systems
+					   this doesn't matter, but
+					   since we don't clear the
+					   table on static mode we
+					   can't do it here or it just
+					   won't work since other
+					   wires will be or are
+					   already set
+					*/ 
+					list_remove(itr);
+					continue;
+				}
 #ifdef HAVE_BG_FILES
 				if((rc = configure_block(bg_record)) 
 				   == SLURM_ERROR) {
@@ -1952,6 +1986,8 @@ static int _validate_config_nodes(void)
 #ifdef HAVE_BG_FILES
 	bg_record_t* bg_record = NULL;	
 	bg_record_t* init_bg_record = NULL;
+	bg_record_t* full_system_bg_record = NULL;	
+	int full_created = 0;
 	ListIterator itr_conf;
 	ListIterator itr_curr;
 	rm_partition_mode_t node_use;
@@ -1962,7 +1998,15 @@ static int _validate_config_nodes(void)
 	
 	if(!bg_recover) 
 		return SLURM_ERROR;
-	
+
+	if(!bg_curr_block_list)
+		return SLURM_ERROR;
+
+	itr_curr = list_iterator_create(bg_curr_block_list);
+	while ((init_bg_record = list_next(itr_curr))) 
+		if(init_bg_record->full_block) 
+			full_system_bg_record = init_bg_record;	
+		
 	itr_conf = list_iterator_create(bg_list);
 	while ((bg_record = (bg_record_t*) list_next(itr_conf))) {
 		/* translate hostlist to ranged 
@@ -1970,7 +2014,7 @@ static int _validate_config_nodes(void)
 		   search here 
 		*/
 		node_use = SELECT_COPROCESSOR_MODE; 
-		itr_curr = list_iterator_create(bg_curr_block_list);
+		list_iterator_reset(itr_curr);
 		while ((init_bg_record = list_next(itr_curr))) {
 			if (strcasecmp(bg_record->nodes, 
 				       init_bg_record->nodes))
@@ -1988,7 +2032,6 @@ static int _validate_config_nodes(void)
 				       bg_record);
 			break;
 		}
-		list_iterator_destroy(itr_curr);
 			
 		if (!bg_record->bg_block_id) {
 			format_node_name(bg_record, tmp_char);	
@@ -1997,6 +2040,9 @@ static int _validate_config_nodes(void)
 			     tmp_char);
 			rc = SLURM_ERROR;
 		} else {
+			if(bg_record->full_block)
+				full_created = 1;
+
 			list_push(bg_found_block_list, bg_record);
 			format_node_name(bg_record, tmp_char);
 			info("Existing: BlockID:%s Nodes:%s Conn:%s",
@@ -2011,8 +2057,26 @@ static int _validate_config_nodes(void)
 		}
 	}		
 	list_iterator_destroy(itr_conf);
+	list_iterator_destroy(itr_curr);
 	if(bluegene_layout_mode == LAYOUT_DYNAMIC)
 		goto finished;
+
+	if(!full_created && full_system_bg_record) {
+		bg_record = xmalloc(sizeof(bg_record_t));
+		copy_bg_record(full_system_bg_record, bg_record);
+		list_push(bg_list, bg_record);
+		list_push(bg_found_block_list, bg_record);
+		format_node_name(bg_record, tmp_char);
+		info("Existing: BlockID:%s Nodes:%s Conn:%s",
+		     bg_record->bg_block_id, 
+		     tmp_char,
+		     convert_conn_type(bg_record->conn_type));
+		if(((bg_record->state == RM_PARTITION_READY)
+		    || (bg_record->state == RM_PARTITION_CONFIGURING))
+		   && !block_exist_in_list(bg_booted_block_list, 
+					   bg_record))
+			list_push(bg_booted_block_list, bg_record);
+	}
 		
 finished:
 	if(list_count(bg_list) == list_count(bg_curr_block_list))
@@ -2056,6 +2120,30 @@ static int _bg_record_cmpf_inc(bg_record_t* rec_a, bg_record_t* rec_b)
 	return 0;
 }
 
+/*
+ * Comparator handed to list_sort(): orders ba_node_t records by
+ * ascending X, then Y, then Z coordinate. Returns -1, 1, or 0; a
+ * 0 result means a duplicate node and is reported via error().
+ */
+static int _ba_node_cmpf_inc(ba_node_t *node_a, ba_node_t *node_b)
+{
+	if (node_a->coord[X] != node_b->coord[X])
+		return (node_a->coord[X] < node_b->coord[X]) ? -1 : 1;
+
+	if (node_a->coord[Y] != node_b->coord[Y])
+		return (node_a->coord[Y] < node_b->coord[Y]) ? -1 : 1;
+
+	if (node_a->coord[Z] != node_b->coord[Z])
+		return (node_a->coord[Z] < node_b->coord[Z]) ? -1 : 1;
+
+	/* all three coordinates match: same node listed twice */
+	error("You have the node %d%d%d in the list twice",
+	      node_a->coord[X],
+	      node_a->coord[Y],
+	      node_a->coord[Z]); 
+	return 0;
+}
+
 static int _delete_old_blocks(void)
 {
 #ifdef HAVE_BG_FILES
diff --git a/src/plugins/select/bluegene/plugin/bluegene.h b/src/plugins/select/bluegene/plugin/bluegene.h
index 5f31a3a38a8..fd586d77d95 100644
--- a/src/plugins/select/bluegene/plugin/bluegene.h
+++ b/src/plugins/select/bluegene/plugin/bluegene.h
@@ -198,7 +198,7 @@ extern bool blocks_overlap(bg_record_t *rec_a, bg_record_t *rec_b);
 #define REMOVE_USER_NONE  0
 #define REMOVE_USER_FOUND 2
 extern int remove_all_users(char *bg_block_id, char *user_name);
-extern void set_block_user(bg_record_t *bg_record);
+extern int set_block_user(bg_record_t *bg_record);
 
 /* Return strings representing blue gene data types */
 extern char *convert_lifecycle(lifecycle_type_t lifecycle);
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index b2b5ee498b2..4504205280e 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -1031,6 +1031,7 @@ static int _shutdown_backup_controller(int wait_time)
 {
 	int rc;
 	slurm_msg_t req;
+	DEF_TIMERS;
 
 	slurm_msg_t_init(&req);
 	if ((slurmctld_conf.backup_addr == NULL) ||
@@ -1045,13 +1046,14 @@ static int _shutdown_backup_controller(int wait_time)
 	/* send request message */
 	req.msg_type = REQUEST_CONTROL;
 	
+	START_TIMER;
 	if (slurm_send_recv_rc_msg_only_one(&req, &rc, CONTROL_TIMEOUT) < 0) {
-		error("shutdown_backup:send/recv: %m");
+		END_TIMER;
+		error("_shutdown_backup_controller:send/recv: %m, %s", TIME_STR);
 		return SLURM_ERROR;
 	}
-
 	if (rc) {
-		error("shutdown_backup: %s", slurm_strerror(rc));
+		error("_shutdown_backup_controller: %s", slurm_strerror(rc));
 		return SLURM_ERROR;
 	}
 	debug("backup controller has relinquished control");
diff --git a/testsuite/expect/test9.7.bash b/testsuite/expect/test9.7.bash
index 25dfb8198ba..102349ed754 100755
--- a/testsuite/expect/test9.7.bash
+++ b/testsuite/expect/test9.7.bash
@@ -53,14 +53,15 @@ else
 	iterations=3
 fi
 
-if [ $5 ]; then
-    inx=512
-else
-    inx=1
-fi
+bluegene=0
+if [ $# -gt 5 ]; then
+	if  [ $5 ]; then
+		bluegene=1
+	fi
+fi	
 
 exit_code=0
-
+inx=1
 log="test9.7.$$.output"
 touch $log
 while [ $inx -le $iterations ]
@@ -73,7 +74,11 @@ do
 		exit_code=$rc
 	fi
 	sleep $sleep_time
-	$exec2 -N1-$inx -n$inx -O -s -l hostname         >>$log 2>&1
+	if [ $bluegene -eq 1 ]; then
+		$exec2 -N1-512 -n1 -s -l hostname         >>$log 2>&1
+	else
+		$exec2 -N1-$inx -n$inx -O -s -l hostname  >>$log 2>&1
+	fi
 	rc=$?
 	if [ $rc -ne 0 ]; then
 		echo "exec2 rc=$rc" >> $log
-- 
GitLab