From 052080f8be7be23a1fdc01ea93fb801a7b2bc08a Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 15 Jul 2009 20:38:01 +0000
Subject: [PATCH] svn merge -r18132:18151
 https://eris.llnl.gov/svn/slurm/branches/slurm-2.0

---
 NEWS                                          | 15 ++++-
 .../block_allocator/block_allocator.c         | 31 +++++++++-
 .../select/bluegene/plugin/bg_block_info.c    | 47 +++++++--------
 .../select/bluegene/plugin/bg_job_run.c       |  3 +-
 .../bluegene/plugin/bg_record_functions.c     | 58 ++++++++-----------
 .../select/bluegene/plugin/select_bluegene.c  | 22 +++----
 src/slurmctld/agent.c                         |  5 +-
 7 files changed, 104 insertions(+), 77 deletions(-)

diff --git a/NEWS b/NEWS
index 168d3f984ca..19b54c25002 100644
--- a/NEWS
+++ b/NEWS
@@ -90,6 +90,19 @@ documents those changes that are of interest to users and admins.
  -- BLUEGENE - add support for scontrol show blocks.
  -- Added support for job step time limits.
 
+* Changes in SLURM 2.0.5
+========================
+ -- BLUEGENE - Added support for emulating systems with an X-dimension of 4.
+ -- BLUEGENE - When a nodecard goes down on a non-Dynamic system, SLURM now
+    only drains blocks no larger than one midplane; if no such block exists,
+    SLURM drains the entire midplane and does not mark any block in an error
+    state.  Previously SLURM drained every block overlapping the nodecard,
+    which made it possible for a large block to take other blocks out of
+    service since they overlap a part of the large block that isn't bad.
+ -- BLUEGENE - Improved handling of L3 errors during boot.
+ -- Don't revoke a pending batch launch request from the slurmctld if the
+    job is immediately suspended (a normal event with gang scheduling).
+
 * Changes in SLURM 2.0.4
 ========================
  -- Permit node suspend/resume logic to be enabled through "scontrol reconfig"
@@ -135,7 +148,7 @@ documents those changes that are of interest to users and admins.
     to properly handle user names that contain all digits. Return error code 
     from uid_from_string() and gid_from_string() functions rather than a uid of
     -1, which might be a valid uid or gid on some systems.
- -- Fix in re-calcuation of job priorities do to DOWN or DRAINED nodes.
+ -- Fix in re-calculation of job priorities due to DOWN or DRAINED nodes.
 
 * Changes in SLURM 2.0.3
 ========================
diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c
index 2eb93254d98..9377654475a 100644
--- a/src/plugins/select/bluegene/block_allocator/block_allocator.c
+++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c
@@ -3779,7 +3779,7 @@ static bool _node_used(ba_node_t* ba_node, int x_size)
 			return true;
 		}
 	}
-		
+	
 	return false;
 
 }
@@ -3986,6 +3986,35 @@ static int _set_external_wires(int dim, int count, ba_node_t* source,
 
 	/* set up split x */
 	if(DIM_SIZE[X] == 1) {
+	} else if(DIM_SIZE[X] == 4) {
+		switch(count) {
+		case 0:
+		case 3:
+			/* 0th and 3rd nodes */
+			/* nothing to do */
+			break;
+		case 1:
+			/* 1st Node */
+			target = &ba_system_ptr->grid[0]
+				[source->coord[Y]]
+				[source->coord[Z]];
+			/* 4->3 of 0th */
+			_switch_config(source, target, dim, 4, 3);
+			break;	
+		case 2:
+			/* 2nd Node */
+			target = &ba_system_ptr->grid[3]
+				[source->coord[Y]]
+				[source->coord[Z]];
+			/* 4->3 of 3rd and back */
+			_switch_config(source, target, dim, 4, 3);
+			_switch_config(source, target, dim, 3, 4);
+			break;
+		default:
+			fatal("got %d for a count on a %d X-dim system",
+			      count, DIM_SIZE[X]);
+			break;
+		}
 	} else if(DIM_SIZE[X] == 5) {
 		/* 4 X dim fixes for wires */
 		switch(count) {
diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c
index 58df0ca635f..3b70f7367dc 100644
--- a/src/plugins/select/bluegene/plugin/bg_block_info.c
+++ b/src/plugins/select/bluegene/plugin/bg_block_info.c
@@ -412,7 +412,18 @@ extern int update_block_list()
 				}
 				remove_from_bg_list(bg_lists->booted,
 						    bg_record);
-			} 
+			} else if(bg_record->state == RM_PARTITION_ERROR) {
+				if(bg_record->boot_state == 1)
+					error("Block %s in an error "
+					      "state while booting.",
+					      bg_record->bg_block_id);
+				else					
+					error("Block %s in an error state.",
+					      bg_record->bg_block_id);
+				remove_from_bg_list(bg_lists->booted,
+						    bg_record);
+				trigger_block_error();
+			}
 			updated = 1;
 			
 		}
@@ -435,32 +446,14 @@ extern int update_block_list()
 						JOB_CONFIGURING;
 				break;
 			case RM_PARTITION_ERROR:
-				bg_record->boot_state = 0;
-				bg_record->boot_count = 0;
-				if(bg_record->job_running > NO_JOB_RUNNING) {
-					error("Block %s in an error "
-					      "state while booting.  "
-					      "Failing job %u.",
-					      bg_record->bg_block_id,
-					      bg_record->job_running);
-					freeit = xmalloc(
-						sizeof(kill_job_struct_t));
-					freeit->jobid = bg_record->job_running;
-					list_push(kill_job_list, freeit);
-					if(remove_from_bg_list(
-						   bg_lists->job_running, 
-						   bg_record) 
-					   == SLURM_SUCCESS) {
-						num_unused_cpus += 
-							bg_record->cpu_cnt;
-					} 
-				} else 
-					error("block %s in an error "
-					      "state while booting.",
-					      bg_record->bg_block_id);
-				remove_from_bg_list(bg_lists->booted,
-						    bg_record);
-				trigger_block_error();
+				/* An error during boot means it is a
+				 * transparent L3 error and the block
+				 * should be trying to fix itself.
+				 * In that case we simply wait for
+				 * the state to go to FREE, at which
+				 * point we will try to boot again
+				 * below.
+				 */
 				break;
 			case RM_PARTITION_FREE:
 				if(bg_record->boot_count < RETRY_BOOT_COUNT) {
diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c
index 2d3cff7fb13..bbb9f4a06aa 100644
--- a/src/plugins/select/bluegene/plugin/bg_job_run.c
+++ b/src/plugins/select/bluegene/plugin/bg_job_run.c
@@ -163,7 +163,8 @@ static int _remove_job(db_job_id_t job_id)
 		else if(job_state == RM_JOB_DYING) {
 			if(count > MAX_POLL_RETRIES) 
 				error("Job %d isn't dying, trying for "
-				      "%d seconds", count*POLL_INTERVAL);
+				      "%d seconds", job_id, 
+				      count*POLL_INTERVAL);
 			continue;
 		} else if(job_state == RM_JOB_ERROR) {
 			error("job %d is in a error state.", job_id);
diff --git a/src/plugins/select/bluegene/plugin/bg_record_functions.c b/src/plugins/select/bluegene/plugin/bg_record_functions.c
index b3cf6db8a36..055c8b8a63e 100644
--- a/src/plugins/select/bluegene/plugin/bg_record_functions.c
+++ b/src/plugins/select/bluegene/plugin/bg_record_functions.c
@@ -510,7 +510,7 @@ extern bg_record_t *find_bg_record_in_list(List my_list, char *bg_block_id)
 	itr = list_iterator_create(my_list);
 	while((bg_record = list_next(itr))) {
 		if(bg_record->bg_block_id)
-			if(!strcmp(bg_record->bg_block_id, bg_block_id))
+			if(!strcasecmp(bg_record->bg_block_id, bg_block_id))
 				break;
 	}
 	list_iterator_destroy(itr);
@@ -1067,26 +1067,19 @@ extern int down_nodecard(char *bp_name, bitoff_t io_start)
 		if(bg_record->job_running > NO_JOB_RUNNING) 
 			slurm_fail_job(bg_record->job_running);
 
-		/* mark every one of these in an error state */
-		if(bg_conf->layout_mode != LAYOUT_DYNAMIC) {
-			if(!delete_list)
-				delete_list = list_create(NULL);
-			list_append(delete_list, bg_record);
-			continue;
-		} 
-
-		/* below is only for dynamic modes since there are
-		   never overlapping blocks there */
-		/* if the block is smaller than the create size just
-		   continue on.
+		/* If running in Dynamic mode and the block is
+		   smaller than the create size, just continue on.
 		*/
-		if(bg_record->node_cnt < create_size) {
+		if((bg_conf->layout_mode == LAYOUT_DYNAMIC)
+		   && (bg_record->node_cnt < create_size)) {
 			if(!delete_list)
 				delete_list = list_create(NULL);
 			list_append(delete_list, bg_record);
 			continue;
 		}
 
+		/* keep track of the smallest block that is at least
+		   create_size in size. */
 		if(!smallest_bg_record || 
 		   (smallest_bg_record->node_cnt > bg_record->node_cnt))
 			smallest_bg_record = bg_record;
@@ -1096,27 +1089,24 @@ extern int down_nodecard(char *bp_name, bitoff_t io_start)
 	
 	if(bg_conf->layout_mode != LAYOUT_DYNAMIC) {
 		debug3("running non-dynamic mode");
-		if(delete_list) {
-			int cnt_set = 0;
-			/* don't lock here since it is handled inside
-			   the put_block_in_error_state
-			*/
-			itr = list_iterator_create(delete_list);
-			while ((bg_record = list_next(itr))) {
-				/* we already handled this */
-				if(bg_record->state == RM_PARTITION_ERROR) {
-					rc = SLURM_NO_CHANGE_IN_DATA;
-					continue;
-				}
-								
-				rc = put_block_in_error_state(
-					bg_record, BLOCK_ERROR_STATE);
-				cnt_set++;
-			}
-			if(cnt_set)
-				rc = SLURM_SUCCESS;
-			list_iterator_destroy(itr);
+		
+		/* This should never happen, but just in case... */
+		if(delete_list) 
 			list_destroy(delete_list);
+
+		/* If we found a block that is smaller than or equal
+		   to a midplane, we just mark it in an error state
+		   rather than draining the midplane.
+		*/
+		if(smallest_bg_record 
+		   && (smallest_bg_record->node_cnt <= bg_conf->bp_node_cnt)){
+			if(smallest_bg_record->state == RM_PARTITION_ERROR) {
+				rc = SLURM_NO_CHANGE_IN_DATA;
+				goto cleanup;
+			}
+			
+			rc = put_block_in_error_state(
+				smallest_bg_record, BLOCK_ERROR_STATE);
 			goto cleanup;
 		} 
 		
diff --git a/src/plugins/select/bluegene/plugin/select_bluegene.c b/src/plugins/select/bluegene/plugin/select_bluegene.c
index 3a1ba5cf4df..97baa5e6902 100644
--- a/src/plugins/select/bluegene/plugin/select_bluegene.c
+++ b/src/plugins/select/bluegene/plugin/select_bluegene.c
@@ -867,17 +867,19 @@ extern int select_p_update_sub_node (update_part_msg_t *part_desc_ptr)
 				rc = SLURM_ERROR;
 				goto end_it;
 			}
+			/* make sure the name we were given is correctly formatted */
 			for(i = 0; i < BA_SYSTEM_DIMENSIONS; i++) {
-				if((part_desc_ptr->name[i] >= '0'
-				    && part_desc_ptr->name[i] <= '9')
-				   || (part_desc_ptr->name[i] >= 'A'
-				      && part_desc_ptr->name[i] <= 'Z')) {
-					error("update_sub_node: "
-					      "misformatted name given %s",
-					      part_desc_ptr->name);
-					rc = SLURM_ERROR;
-					goto end_it;
-				}
+				if((part_desc_ptr->name[j+i] >= '0'
+				    && part_desc_ptr->name[j+i] <= '9')
+				   || (part_desc_ptr->name[j+i] >= 'A'
+				      && part_desc_ptr->name[j+i] <= 'Z')) 
+					continue;
+				
+				error("update_sub_node: "
+				      "misformatted name given %s",
+				      part_desc_ptr->name);
+				rc = SLURM_ERROR;
+				goto end_it;
 			}
 			
 			strncpy(coord, part_desc_ptr->name+j,
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 1d5262ce488..2a0180752fb 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -1476,8 +1476,6 @@ extern void mail_job_info (struct job_record *job_ptr, uint16_t mail_type)
 	return;
 }
 
-/* return true if the requests is to launch a batch job and the message
- * destination is not yet powered up, otherwise return false */
 /* Test if a batch launch request should be defered
  * RET -1: abort the request, pending job cancelled
  *      0: execute the request now
@@ -1503,7 +1501,8 @@ static int _batch_launch_defer(queued_request_t *queued_req_ptr)
 
 	launch_msg_ptr = (batch_job_launch_msg_t *)agent_arg_ptr->msg_args;
 	job_ptr = find_job_record(launch_msg_ptr->job_id);
-	if ((job_ptr == NULL) || (!IS_JOB_RUNNING(job_ptr))) {
+	if ((job_ptr == NULL) || 
+	    (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) {
 		info("agent(batch_launch): removed pending request for "
 		     "cancelled job %u",
 		     launch_msg_ptr->job_id);
-- 
GitLab