From dc5925f11aef5545ff77f047b877d92bbd6f9886 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 8 Mar 2010 22:38:28 +0000
Subject: [PATCH] In select/cons_res, allocated cores for a job using a
 best-fit approach.

---
 NEWS                                     |   1 +
 src/plugins/select/cons_res/dist_tasks.c | 163 +++++++++++++++++++----
 2 files changed, 139 insertions(+), 25 deletions(-)

diff --git a/NEWS b/NEWS
index 1cde1cfe129..0e75e5bdf2f 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,7 @@ documents those changes that are of interest to users and admins.
  -- In select/cons_res, the count of CPUs on required nodes was formerly
     ignored in enforcing the maximum CPU limit. Also enforce maximum CPU
     limit when the topology/tree plugin is configured (previously ignored).
+ -- In select/cons_res, allocated cores for a job using a best-fit approach.
 
 * Changes in SLURM 2.2.0.pre2
 =============================
diff --git a/src/plugins/select/cons_res/dist_tasks.c b/src/plugins/select/cons_res/dist_tasks.c
index 18ceba8f113..6f54e55d4c0 100644
--- a/src/plugins/select/cons_res/dist_tasks.c
+++ b/src/plugins/select/cons_res/dist_tasks.c
@@ -165,7 +165,8 @@ static int _compute_plane_dist(struct job_record *job_ptr)
 	return SLURM_SUCCESS;
 }
 
-/* sync up core bitmap with new CPU count
+/* sync up core bitmap with new CPU count using a best-fit approach
+ * on the available sockets
  *
  * The CPU array contains the distribution of CPUs, which can include
  * virtual CPUs (hyperthreads)
@@ -173,11 +174,19 @@ static int _compute_plane_dist(struct job_record *job_ptr)
 static void _block_sync_core_bitmap(struct job_record *job_ptr,
 				    const uint16_t cr_type)
 {
-	uint32_t c, i, n, size, csize, core_cnt;
+	uint32_t c, s, i, j, n, size, csize, core_cnt;
 	uint16_t cpus, num_bits, vpus = 1;
 	job_resources_t *job_res = job_ptr->job_resrcs;
 	bool alloc_cores = false, alloc_sockets = false;
 	uint16_t ntasks_per_core = 0xffff;
+	int* sockets_cpu_cnt;
+	bool* sockets_used;
+	uint16_t sockets_nb;
+	uint16_t ncores_nb;
+	uint16_t nsockets_nb;
+	uint16_t req_cpus,best_fit_cpus = 0;
+	uint32_t best_fit_location = 0;
+	bool sufficient,best_fit_sufficient;
 
 	if (!job_res)
 		return;
@@ -198,53 +207,157 @@ static void _block_sync_core_bitmap(struct job_record *job_ptr,
 
 	size  = bit_size(job_res->node_bitmap);
 	csize = bit_size(job_res->core_bitmap);
+
+	sockets_nb  = select_node_record[0].sockets;
+	sockets_cpu_cnt = xmalloc(sockets_nb * sizeof(int));	
+	sockets_used = xmalloc(sockets_nb * sizeof(bool));
+
 	for (c = 0, i = 0, n = 0; n < size; n++) {
 
 		if (bit_test(job_res->node_bitmap, n) == 0)
 			continue;
+
 		core_cnt = 0;
-		num_bits = select_node_record[n].sockets *
-				select_node_record[n].cores;
+		ncores_nb = select_node_record[n].cores;
+		nsockets_nb = select_node_record[n].sockets;
+		num_bits =  nsockets_nb * ncores_nb;
+				
 		if ((c + num_bits) > csize)
 			fatal ("cons_res: _block_sync_core_bitmap index error");
 
 		cpus  = job_res->cpus[i];
 		vpus  = MIN(select_node_record[n].vpus, ntasks_per_core);
 
-		while ((cpus > 0) && (num_bits > 0)) {
-			if (bit_test(job_res->core_bitmap, c++)) {
-				core_cnt++;
-				if (cpus < vpus)
-					cpus = 0;
-				else
-					cpus -= vpus;
+		if ( nsockets_nb > sockets_nb) {
+			sockets_nb = nsockets_nb;
+			xrealloc(sockets_cpu_cnt, sockets_nb * sizeof(int));
+			xrealloc(sockets_used,sockets_nb * sizeof(bool));
+		}
+
+		/* count cores provided by each socket */
+		for (s = 0; s < nsockets_nb; s++) {
+			sockets_cpu_cnt[s]=0;
+			sockets_used[s]=false;
+			for ( j = c + (s * ncores_nb) ; 
+			      j < c + ((s+1) * ncores_nb) ; 
+			      j++ ) {
+				if ( bit_test(job_res->core_bitmap,j) )
+					sockets_cpu_cnt[s]++;
 			}
-			num_bits--;
 		}
+
+		/* select cores in the sockets using a best-fit approach */
+		while( cpus > 0 ) {
+			
+			best_fit_cpus = 0;
+			best_fit_sufficient = false;
+
+			/* compute still required cores on the node */
+			req_cpus = cpus / vpus;
+			if ( cpus % vpus )
+				req_cpus++;
+			
+			/* search for the best socket, */
+			/* starting from the last one to let more room */
+			/* in the first one for system usage */
+			for ( s = nsockets_nb - 1 ; (int) s >= (int) 0 ; s-- ) {
+				sufficient = sockets_cpu_cnt[s] >= req_cpus ;
+				if ( (best_fit_cpus == 0) ||
+				     (sufficient && !best_fit_sufficient ) ||
+				     (sufficient && (sockets_cpu_cnt[s] < 
+						     best_fit_cpus)) ||
+				     (!sufficient && (sockets_cpu_cnt[s] >
+						      best_fit_cpus)) ) {
+					best_fit_cpus = sockets_cpu_cnt[s];
+					best_fit_location = s;
+					best_fit_sufficient = sufficient;
+				}
+			}
+
+			/* check that we have found a usable socket */
+			if ( best_fit_cpus == 0 )
+				break;
+
+			debug3("dist_task: best_fit : using node[%lu]:"
+			       "socket[%lu] : %u cores available",
+			       n,best_fit_location,
+			       sockets_cpu_cnt[best_fit_location]);
+			
+			/* select socket cores from last to first */
+			/* socket[0]:Core[0] would be the last one */
+			sockets_used[best_fit_location] = true;
+			
+			for ( j = c + ((best_fit_location+1) * ncores_nb) 
+				      - 1 ;
+			      (int) j >= (int) (c + (best_fit_location * 
+						     ncores_nb)) ;
+			      j-- ) {
+
+				/*
+				 * if no more cpus to select
+				 * release remaining cores unless
+				 * we are allocating whole sockets
+				 */
+				if ( cpus == 0 && alloc_sockets ) {
+					if ( bit_test(job_res->core_bitmap,j) )
+						core_cnt++;
+					continue;
+				}
+				else if ( cpus == 0 ) {
+					bit_clear(job_res->core_bitmap,j);
+					continue;
+				}
+				
+				/*
+				 * remove cores from socket count and 
+				 * cpus count using hyperthreading requirement
+				 */
+				if ( bit_test(job_res->core_bitmap,j) ) {
+					sockets_cpu_cnt[best_fit_location]--;
+					core_cnt++;
+					if (cpus < vpus)
+						cpus = 0;
+					else
+						cpus -= vpus;
+				}
+
+			}
+			
+			/* loop again if more cpus required */
+			if ( cpus > 0 )
+				continue;
+
+			/* release remaining cores of the unused sockets */
+			for (s = 0; s < nsockets_nb; s++) {
+				if ( sockets_used[s] ) 
+					continue;
+				bit_nclear(job_res->core_bitmap,
+					   c+(s*ncores_nb),
+					   c+((s+1)*ncores_nb)-1);
+			}
+			
+		}
+
 		if (cpus > 0)
 			/* cpu count should NEVER be greater than the number
 			 * of set bits in the core bitmap for a given node */
 			fatal("cons_res: cpus computation error");
 
-		if (alloc_sockets) {	/* Advance to end of socket */
-			while ((num_bits > 0) &&
-			       (c % select_node_record[n].cores)) {
-				if (bit_test(job_res->core_bitmap, c++))
-					core_cnt++;
-				num_bits--;
-			}
-		}
-		while (num_bits > 0) {
-			bit_clear(job_res->core_bitmap, c++);
-			num_bits--;
-		}
+		/* adjust cpus count of the current node */
 		if ((alloc_cores || alloc_sockets) &&
 		    (select_node_record[n].vpus > 1)) {
 			job_res->cpus[i] = core_cnt *
-					   select_node_record[n].vpus;
+				select_node_record[n].vpus;
 		}
 		i++;
+		
+		/* move c to the next node in core_bitmap */
+		c += num_bits;
+
 	}
+
+	xfree(sockets_cpu_cnt);
+	xfree(sockets_used);
 }
 
 
-- 
GitLab