From 98f4b537f52c378c69fdfe8dcc7ee318f757d369 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 26 Aug 2004 23:41:44 +0000
Subject: [PATCH] Move node selection logic into new plugin (needed for Blue
 Gene).

---
 NEWS                                          |   4 +-
 doc/html/news.html                            |   7 +-
 doc/html/quickstart_admin.html                |  43 +-
 doc/html/schedplugins.html                    |   4 +-
 doc/html/selectplugins.html                   |  10 +-
 src/common/slurm_protocol_api.c               |  13 +
 src/common/slurm_protocol_api.h               |   5 +
 src/plugins/select/bluegene/select_bluegene.c |  26 +-
 src/plugins/select/linear/select_linear.c     | 253 ++++++++++-
 src/slurmctld/node_scheduler.c                | 410 ++++--------------
 src/slurmctld/select_plugin.c                 |   5 +-
 src/slurmctld/select_plugin.h                 |   2 +-
 12 files changed, 409 insertions(+), 373 deletions(-)
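
As orientation for the diffs below: the node selection API being introduced is
resolved through the ops table in src/slurmctld/select_plugin.c (node_init,
job_test, job_init, job_fini). The skeleton below is a minimal sketch of a
conforming plugin, reconstructed from the entry points visible in this patch;
the "select/example" minor type and the empty bodies are illustrative only:

	#include <slurm/slurm_errno.h>
	#include "src/slurmctld/slurmctld.h"

	/* symbols required by the generic plugin interface */
	const char plugin_name[]      = "Example node selection plugin";
	const char plugin_type[]      = "select/example"; /* major type must be "select" */
	const uint32_t plugin_version = 90;

	/* called with the global node table before any job tests */
	extern int select_p_node_init(struct node_record *node_ptr, int node_cnt)
	{
		return SLURM_SUCCESS;
	}

	/* IN/OUT bitmap: usable nodes are set on entry; clear the nodes
	 * not selected and leave the chosen ones set */
	extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
			int min_nodes, int max_nodes)
	{
		return SLURM_SUCCESS;
	}

	/* hooks for system-specific set-up/clean-up around job launch */
	extern int select_p_job_init(struct job_record *job_ptr)
	{
		return SLURM_SUCCESS;
	}

	extern int select_p_job_fini(struct job_record *job_ptr)
	{
		return SLURM_SUCCESS;
	}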

diff --git a/NEWS b/NEWS
index 968844dd600..91678fc4049 100644
--- a/NEWS
+++ b/NEWS
@@ -13,8 +13,8 @@ documents those changes that are of interest to users and admins.
  -- srun cancels created job if job step creation fails
 -- Added a lot of Blue Gene/L support logic: slurmd executes on a single 
     node to front-end the 512-CPU base-partitions (Blue Gene/L's nodes)
- -- Add SelectType configuration parameter for node selection plugin
- -- Add node selection plugin infrastructure (still need to move logic)
+ -- Add node selection plugin infrastructure, relocate existing logic 
+    to select/linear, add configuration parameter SelectType
  -- Modify node hashing algorithm for better performance on Blue Gene/L
 
 * Changes in SLURM 0.4.0-pre1
diff --git a/doc/html/news.html b/doc/html/news.html
index 2111fbd1667..19d9572a1d5 100644
--- a/doc/html/news.html
+++ b/doc/html/news.html
@@ -9,7 +9,7 @@
 <meta http-equiv="keywords" content="Simple Linux Utility for Resource Management, SLURM, resource management, 
 Linux clusters, high-performance computing, Livermore Computing">
 <meta name="LLNLRandR" content="UCRL-WEB-204324">
-<meta name="LLNLRandRdate" content="3 August 2004">
+<meta name="LLNLRandRdate" content="26 August 2004">
 <meta name="distribution" content="global">
 <meta name="description" content="Simple Linux Utility for Resource Management">
 <meta name="copyright"
@@ -80,7 +80,7 @@ Major enhancements include:
 <li>Support for the IBM AIX operating system.</li>
 <li>Support for the IBM Federation switch.</li>
 <li>Support for the IBM 
-<a href="http://www.research.ibm.com/bluegene/">IBM BlueGene/L</a> system.</li>
+<a href="http://www.research.ibm.com/bluegene/">BlueGene</a> system.</li>
 <li>Checkpoint plugin added with support for IBM system checkpoint.</li>
 <li>I/O streams for all tasks on a node are transmitted through one pair of sockets 
 instead of distinct sockets for each task. This improves performance and scalability.</li>
@@ -94,6 +94,7 @@ not been finalized. Anyone desiring to perform SLURM development should notify
 <a href="mailto:slurm-dev@lists.llnl.gov">slurm-dev@lists.llnl.gov</a>
 to coordinate activities. Future development plans include:
 <ul>
+<li>Support for the IBM Federation switch.</li>
 <li>Support of various MPI types via a plugin mechanism.</li>
 <li>Permit resource allocations (jobs) to change size.</li>
 <li>Manage consumable resources on a per-node (e.g., memory, disk space) 
@@ -114,7 +115,7 @@ and system-wide basis (e.g., licenses).</li>
 <td colspan="3"><hr> <p>For information about this page, contact <a href="mailto:slurm-dev@lists.llnl.gov">slurm-dev@lists.llnl.gov</a>.</p>
 <p><a href="http://www.llnl.gov/"><img align=middle src="lll.gif" width="32" height="32" border="0"></a></p>
 <p class="footer">UCRL-WEB-204324<br>
-Last modified 3 August 2004</p></td>
+Last modified 26 August 2004</p></td>
 </tr>
 </table>
 </td>
diff --git a/doc/html/quickstart_admin.html b/doc/html/quickstart_admin.html
index 9f3aa8edb0f..dfa60425e1d 100644
--- a/doc/html/quickstart_admin.html
+++ b/doc/html/quickstart_admin.html
@@ -9,7 +9,7 @@
 <meta http-equiv="keywords" content="Simple Linux Utility for Resource Management, SLURM, resource management, 
 Linux clusters, high-performance computing, Livermore Computing">
 <meta name="LLNLRandR" content="UCRL-WEB-204324">
-<meta name="LLNLRandRdate" content="17 August 2004">
+<meta name="LLNLRandRdate" content="26 August 2004">
 <meta name="distribution" content="global">
 <meta name="description" content="Simple Linux Utility for Resource Management">
 <meta name="copyright"
@@ -150,24 +150,34 @@ nodes in the slurm.conf configuration file. SLURM presently lacks the ability
 to arbitrarily order tasks across nodes.</p> 
 
 <h4>Scheduler support</h4>
-<p>SLURM's default scheduler is FIFO (First-In First-Out). A backfill scheduler 
+<p>The scheduler used by SLURM is controlled by the <b>SchedulerType</b> configuration 
+parameter, which determines the relative ordering of pending jobs.
+SLURM's default scheduler is FIFO (First-In First-Out). A backfill scheduler 
 plugin is also available. Backfill scheduling will initiate a lower-priority job 
 if doing so does not delay the expected initiation time of higher priority jobs; 
 essentially using smaller jobs to fill holes in the resource allocation plan. 
-<a href="http://supercluster.org/maui">The Maui Scheduler</a> offers sophisticated 
-scheduling algorithms to control SLURM's workload. Motivated users can even develop 
-their own scheduler plugin if so desired. </p>
-<p>SLURM uses the syslog function to record events. It uses a range of importance 
-levels for these messages. Be certain that your system's syslog functionality 
-is operational. 
-</p>
+SLURM also supports a plugin for use of <a href="http://supercluster.org/maui">
+The Maui Scheduler</a>, which offers sophisticated scheduling algorithms. 
+Motivated users can even develop their own scheduler plugin if so desired. </p>
+
+<p>SLURM also has a plugin to control the node selection algorithm for jobs.
+This is controlled by the <b>SelectType</b> configuration parameter.
+The two available options are <i>linear</i>, to support a flat interconnect, 
+and <i>bluegene</i>, to support Blue Gene systems with a three-dimensional 
+topology.</p>
+
+<h4>Logging</h4>
+<p>SLURM uses the syslog function to record events. It uses a range of importance
+levels for these messages. Be certain that your system's syslog functionality
+is operational. </p>
+
 <h4>Corefile format</h4>
 <p>SLURM is designed to support generating a variety of core file formats for 
 application codes that fail (see the <i>--core</i> option of the <i>srun</i>
 command).  As of now, SLURM only supports a locally developed lightweight
 corefile library which has not yet been released to the public. It is 
-expected that this library will be available in the near future.
-</p>
+expected that this library will be available in the near future. </p>
+
 <h4>Parallel debugger support</h4>
 <p>SLURM exports information for parallel debuggers using the specification
 detailed  <a href=http://www-unix.mcs.anl.gov/mpi/mpi-debug/mpich-attach.txt>here</a>.
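
To tie the two new paragraphs above to the configuration file, a minimal
slurm.conf fragment exercising both parameters might read as follows
(select/linear matches the sample configuration later in this guide;
select/bluegene is the Blue Gene alternative):

	# scheduler plugin (FIFO by default; backfill shown here)
	SchedulerType=sched/backfill
	# node selection plugin: select/linear or select/bluegene
	SelectType=select/linear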
@@ -177,7 +187,7 @@ and support is unconditionally compiled into SLURM code.
 <p>We use a patched version of TotalView that looks for a "totalview_jobid" 
 symbol in <b>srun</b> that it then uses (configurably) to perform a bulk 
 launch of the <b>tvdsvr</b> daemons via a subsequent <b>srun</b>. Otherwise
-it is diffuclt to get TotalView to use <b>srun</b> for a bulk launch, since 
+it is difficult to get TotalView to use <b>srun</b> for a bulk launch, since 
 <b>srun</b> will be unable to determine for which job it is launching tasks.
 </p>
 <p>Another solution would be to run TotalView within an existing <b>srun</b>
@@ -186,6 +196,7 @@ could be set to ensure only a single task per node. This functions properly
 because the SLURM_JOBID environment variable is set in the allocation shell 
 environment.
 </p>
+
 <p class="footer"><a href="#top">top</a></p>
 
 <h3>Configuration</h3>
@@ -213,7 +224,8 @@ minimum configuration values will be considered DOWN and not scheduled.</p>
 # 
 # Sample /etc/slurm.conf for mcr.llnl.gov
 #
-ControlMachine=mcri   ControlAddr=emcri 
+ControlMachine=mcri   ControlAddr=emcri
+BackupMachine=mcrj    BackupAddr=emcrj 
 #
 AuthType=auth/munge
 Epilog=/usr/local/slurm/etc/epilog
@@ -225,6 +237,7 @@ JobCredentialPublicCertificate=/usr/local/etc/slurm.cert
 PluginDir=/usr/local/slurm/lib/slurm
 Prolog=/usr/local/slurm/etc/prolog
 SchedulerType=sched/backfill
+SelectType=select/linear
 SlurmUser=slurm
 SlurmctldPort=7002
 SlurmctldTimeout=300
@@ -237,7 +250,7 @@ SwitchType=switch/elan
 # Node Configurations
 #
 NodeName=DEFAULT Procs=2 RealMemory=2000 TmpDisk=64000 State=UNKNOWN
-NodeName=mcr[0-1151]  NodeAddr=emcr[0-1151]
+NodeName=mcr[0-1151] NodeAddr=emcr[0-1151]
 #
 # Partition Configurations
 #
@@ -432,7 +445,7 @@ in the NEWS file.
 <td colspan="3"><hr> <p>For information about this page, contact <a href="mailto:slurm-dev@lists.llnl.gov">slurm-dev@lists.llnl.gov</a>.</p>
 <p><a href="http://www.llnl.gov/"><img align=middle src="lll.gif" width="32" height="32" border="0"></a></p>
 <p class="footer">UCRL-WEB-204324<br>
-Last modified 17 August 2004</p></td>
+Last modified 26 August 2004</p></td>
 </tr>
 </table>
 </td>
diff --git a/doc/html/schedplugins.html b/doc/html/schedplugins.html
index ae9d23eb05e..8093c48fc89 100644
--- a/doc/html/schedplugins.html
+++ b/doc/html/schedplugins.html
@@ -64,7 +64,9 @@ and can periodically alter job priorities to change their order within the queue
 The <b>wiki</b> scheduler establishes an initial priority of zero (held) for
 all jobs. These jobs only begin execution when the <b>wiki</b> scheduler 
 explicitly raises their priority (releasing them). 
-Developers may use the model that best fits their needs.</p>
+Developers may use the model that best fits their needs. 
+Note that a separate <a href="selectplugins.html" class="nav">node selection plugin</a>
+is available for controlling that aspect of scheduling.</p>
 
 <p>SLURM scheduler plugins are SLURM plugins that implement the SLURM scheduler
 API described herein. They must conform to the SLURM Plugin API with the following 
diff --git a/doc/html/selectplugins.html b/doc/html/selectplugins.html
index 2d0ee973e67..042a7b64b47 100644
--- a/doc/html/selectplugins.html
+++ b/doc/html/selectplugins.html
@@ -9,7 +9,7 @@
 <meta http-equiv="keywords" content="Simple Linux Utility for Resource Management, SLURM, resource management, 
 Linux clusters, high-performance computing, Livermore Computing">
 <meta name="LLNLRandR" content="UCRL-WEB-204324">
-<meta name="LLNLRandRdate" content="25 August 2004">
+<meta name="LLNLRandRdate" content="26 August 2004">
 <meta name="distribution" content="global">
 <meta name="description" content="Simple Linux Utility for Resource Management">
 <meta name="copyright"
@@ -62,7 +62,9 @@ them. It is intended as a resource to programmers wishing to write their own SLU
 node selection plugins. This is version 0 of the API.</p>
 
 <p>SLURM node selection plugins are SLURM plugins that implement the SLURM node selection
-API described herein. They must conform to the SLURM Plugin API with the following 
+API described herein. They are intended to provide a mechanism for both selecting 
+nodes for pending jobs and performing any system-specific tasks for job launch or 
+termination. The plugins must conform to the SLURM Plugin API with the following 
 specifications:</p>
 <p><span class="commandline">const char plugin_type[]</span><br>
 The major type must be &quot;select.&quot; The minor type can be any recognizable 
@@ -178,7 +180,7 @@ the plugin should return SLURM_ERROR, causing slurmctld to exit.</p>
 
 <h4>Job-Specific Node Selection Functions</h4>
 <p class="commandline">int select_p_job_test (struct job_record *job_ptr,
-bitstr_t bitmap, int min_nodes, int max_nodes);</p>
+bitstr_t *bitmap, int min_nodes, int max_nodes);</p>
 <p style="margin-left:.2in"><b>Description</b>: Given a job's scheduling requirement 
 specification and a set of nodes which might be used to satisfy the request, identify 
 the nodes which "best" satisfy the request. Note that nodes being considered for allocation 
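
To make the IN/OUT bitmap contract concrete: the sketch below keeps the first
min_nodes usable nodes and clears the rest. bit_test/bit_clear and the
node_record_count global are the same bitstring calls and global referenced by
the plugins in this patch; the first-fit rule itself (and ignoring max_nodes)
is purely illustrative:

	extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
			int min_nodes, int max_nodes)
	{
		int i, kept = 0;

		for (i = 0; i < node_record_count; i++) {
			if (!bit_test(bitmap, i))
				continue;		/* not usable for this job */
			if (kept < min_nodes)
				kept++;			/* keep this node selected */
			else
				bit_clear(bitmap, i);	/* usable, but not chosen */
		}
		return (kept >= min_nodes) ? SLURM_SUCCESS : EINVAL;
	}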
@@ -255,7 +257,7 @@ to maintain data format compatibility across different versions of the plugin.</
 <a href="mailto:slurm-dev@lists.llnl.gov">slurm-dev@lists.llnl.gov</a>.</p>
 <p><a href="http://www.llnl.gov/"><img align=middle src="lll.gif" width="32" height="32" border="0"></a></p>
 <p class="footer">UCRL-WEB-204324<br>
-Last modified 25 August 2004</p></td>
+Last modified 26 August 2004</p></td>
 </tr>
 </table>
 </td>
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 68a3eaea134..05f5f86535a 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -212,6 +212,19 @@ char *slurm_get_auth_type(void)
         return auth_type;
 }
 
+/* slurm_get_fast_schedule
+ * returns the value of fast_schedule in slurmctld_conf object
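+ * NOTE: the value is read while holding the config mutex, so the call 
+ *	is safe for concurrent use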
+ */
+extern uint16_t slurm_get_fast_schedule(void)
+{
+	uint16_t fast_val;
+
+        _lock_update_config();
+        fast_val = slurmctld_conf.fast_schedule;
+        slurm_mutex_unlock(&config_lock);
+        return fast_val;
+}
+
 /* slurm_set_auth_type
  * set the authentication type in slurmctld_conf object
  * used for security testing purposes
diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h
index 386c4181e4f..cd232664c75 100644
--- a/src/common/slurm_protocol_api.h
+++ b/src/common/slurm_protocol_api.h
@@ -119,6 +119,11 @@ extern char *slurm_get_auth_type(void);
  */
 extern int slurm_set_auth_type(char *auth_type);
 
+/* slurm_get_fast_schedule
+ * returns the value of fast_schedule in slurmctld_conf object
+ */
+extern uint16_t slurm_get_fast_schedule(void);
+
 /* slurm_get_jobcomp_type
  * returns the job completion logger type from slurmctld_conf object
  * RET char *    - job completion type,  MUST be xfreed by caller
diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c
index a22deec82b2..6508e0d75e6 100644
--- a/src/plugins/select/bluegene/select_bluegene.c
+++ b/src/plugins/select/bluegene/select_bluegene.c
@@ -115,13 +115,33 @@ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt)
 		return SLURM_ERROR;
 	}
 
-	return SLURM_SUCCESS;
+	error("select/bluegene plugin not yet functional");
+	return SLURM_ERROR;
 }
 
-extern int select_p_job_test(struct job_record *job_ptr, bitstr_t bitmap, 
+/*
+ * select_p_job_test - Given a specification of scheduling requirements, 
+ *	identify the nodes which "best" satisfy the request.
+ * 	"best" is defined as either a single set of consecutive nodes satisfying 
+ *	the request and leaving the minimum number of unused nodes OR 
+ *	the fewest number of consecutive node sets
+ * IN job_ptr - pointer to job being scheduled
+ * IN/OUT bitmap - usable nodes are set on input, nodes not required to 
+ *	satisfy the request are cleared, others left set
+ * IN min_nodes - minimum count of nodes
+ * IN max_nodes - maximum count of nodes (0==don't care)
+ * RET zero on success, EINVAL otherwise
+ * globals (passed via select_p_node_init): 
+ *	node_record_count - count of nodes configured
+ *	node_record_table_ptr - pointer to global node table
+ * NOTE: bitmap must be a superset of req_nodes at the time that 
+ *	select_p_job_test is called
+ */
+extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 		int min_nodes, int max_nodes)
 {
-	return SLURM_SUCCESS;
+	error("select/bluegene plugin not yet functional");
+	return SLURM_ERROR;
 }
 
 extern int select_p_job_init(struct job_record *job_ptr)
diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c
index 9b9fd87b290..1a4378ec8ce 100644
--- a/src/plugins/select/linear/select_linear.c
+++ b/src/plugins/select/linear/select_linear.c
@@ -42,8 +42,13 @@
 #include <slurm/slurm_errno.h>
 
 #include "src/common/log.h"
+#include "src/common/slurm_protocol_api.h"
+#include "src/common/xassert.h"
+#include "src/common/xmalloc.h"
 #include "src/slurmctld/slurmctld.h"
 
+#define SELECT_DEBUG 0
+
 /*
  * These variables are required by the generic plugin interface.  If they
  * are not found in the plugin, the plugin loader will ignore it.
@@ -78,6 +83,20 @@ const uint32_t plugin_version	= 90;
 
 static struct node_record *select_node_ptr;
 static int select_node_cnt;
+static uint16_t select_fast_schedule;
+
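+/*
+ * _enough_nodes - true if avail_nodes can complete the job's node count;
+ *	rem_nodes is initialized to max_nodes when a maximum is given, so
+ *	only (rem_nodes + min_nodes - max_nodes) more nodes are strictly
+ *	needed to reach the min_nodes floor
+ */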
+static bool 
+_enough_nodes(int avail_nodes, int rem_nodes, int min_nodes, int max_nodes)
+{
+	int needed_nodes;
+
+	if (max_nodes)
+		needed_nodes = rem_nodes + min_nodes - max_nodes;
+	else
+		needed_nodes = rem_nodes;
+
+	return(avail_nodes >= needed_nodes);
+}
 
 /*
  * init() is called when the plugin is loaded, before any other functions
@@ -122,18 +141,242 @@ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt)
 
 	select_node_ptr = node_ptr;
 	select_node_cnt = node_cnt;
+	select_fast_schedule = slurm_get_fast_schedule();
 
 	return SLURM_SUCCESS;
 }
 
-extern int select_p_job_test(struct job_record *job_ptr, bitstr_t bitmap, 
+/*
+ * select_p_job_test - Given a specification of scheduling requirements, 
+ *	identify the nodes which "best" satisfy the request.
+ * 	"best" is defined as either a single set of consecutive nodes satisfying 
+ *	the request and leaving the minimum number of unused nodes OR 
+ *	the fewest number of consecutive node sets
+ * IN job_ptr - pointer to job being scheduled
+ * IN/OUT bitmap - usable nodes are set on input, nodes not required to 
+ *	satisfy the request are cleared, others left set
+ * IN min_nodes - minimum count of nodes
+ * IN max_nodes - maximum count of nodes (0==don't care)
+ * RET zero on success, EINVAL otherwise
+ * globals (passed via select_p_node_init): 
+ *	node_record_count - count of nodes configured
+ *	node_record_table_ptr - pointer to global node table
+ * NOTE: the job information that is considered for scheduling includes:
+ *	req_node_bitmap: bitmap of specific nodes required by the job
+ *	contiguous: allocated nodes must be sequentially located
+ *	num_procs: minimum number of processors required by the job
+ * NOTE: bitmap must be a superset of req_nodes at the time that 
+ *	select_p_job_test is called
+ */
+extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 		int min_nodes, int max_nodes)
 {
-#if 1
-/* A bunch of logic needs to be moved here from slurmctld/node_scheduler.c */
-info("in select_p_job_test(%u)",job_ptr->job_id);
+	int i, index, error_code = EINVAL, sufficient;
+	int *consec_nodes;	/* how many nodes we can add from this 
+				 * consecutive set of nodes */
+	int *consec_cpus;	/* how many cpus we can add from this 
+				 * consecutive set of nodes */
+	int *consec_start;	/* where this consecutive set starts (index) */
+	int *consec_end;	/* where this consecutive set ends (index) */
+	int *consec_req;	/* are nodes from this set required 
+				 * (in req_bitmap) */
+	int consec_index, consec_size;
+	int rem_cpus, rem_nodes;	/* remaining resources required */
+	int best_fit_nodes, best_fit_cpus, best_fit_req;
+	int best_fit_location = 0, best_fit_sufficient;
+
+	xassert(bitmap);
+
+	consec_index = 0;
+	consec_size  = 50;	/* start allocation for 50 sets of 
+				 * consecutive nodes */
+	consec_cpus  = xmalloc(sizeof(int) * consec_size);
+	consec_nodes = xmalloc(sizeof(int) * consec_size);
+	consec_start = xmalloc(sizeof(int) * consec_size);
+	consec_end   = xmalloc(sizeof(int) * consec_size);
+	consec_req   = xmalloc(sizeof(int) * consec_size);
+
+	/* Build table with information about sets of consecutive nodes */
+	consec_cpus[consec_index] = consec_nodes[consec_index] = 0;
+	consec_req[consec_index] = -1;	/* no required nodes here by default */
+	rem_cpus = job_ptr->num_procs;
+	if (max_nodes)
+		rem_nodes = max_nodes;
+	else
+		rem_nodes = min_nodes;
+	for (index = 0; index < select_node_cnt; index++) {
+		if (bit_test(bitmap, index)) {
+			if (consec_nodes[consec_index] == 0)
+				consec_start[consec_index] = index;
+			if (select_fast_schedule)
+				/* don't bother checking each node */
+				i = select_node_ptr[index].
+				    config_ptr->cpus;
+			else
+				i = select_node_ptr[index].cpus;
+			if (job_ptr->details->req_node_bitmap && 
+			    bit_test(job_ptr->details->req_node_bitmap, index)) {
+				if (consec_req[consec_index] == -1)
+					/* first required node in set */
+					consec_req[consec_index] = index;
+				rem_cpus -= i;
+				rem_nodes--;
+			} else {	 /* node not required (yet) */
+				bit_clear(bitmap, index); 
+				consec_cpus[consec_index] += i;
+				consec_nodes[consec_index]++;
+			}
+		} else if (consec_nodes[consec_index] == 0) {
+			consec_req[consec_index] = -1;
+			/* already picked up any required nodes */
+			/* re-use this record */
+		} else {
+			consec_end[consec_index] = index - 1;
+			if (++consec_index >= consec_size) {
+				consec_size *= 2;
+				xrealloc(consec_cpus,
+					 sizeof(int) * consec_size);
+				xrealloc(consec_nodes,
+					 sizeof(int) * consec_size);
+				xrealloc(consec_start,
+					 sizeof(int) * consec_size);
+				xrealloc(consec_end,
+					 sizeof(int) * consec_size);
+				xrealloc(consec_req,
+					 sizeof(int) * consec_size);
+			}
+			consec_cpus[consec_index] = 0;
+			consec_nodes[consec_index] = 0;
+			consec_req[consec_index] = -1;
+		}
+	}
+	if (consec_nodes[consec_index] != 0)
+		consec_end[consec_index++] = index - 1;
+
+#if SELECT_DEBUG
+	/* don't compile this, slows things down too much */
+	debug3("rem_cpus=%d, rem_nodes=%d", rem_cpus, rem_nodes);
+	for (i = 0; i < consec_index; i++) {
+		if (consec_req[i] != -1)
+			debug3
+			    ("start=%s, end=%s, nodes=%d, cpus=%d, req=%s",
+			     select_node_ptr[consec_start[i]].name,
+			     select_node_ptr[consec_end[i]].name,
+			     consec_nodes[i], consec_cpus[i],
+			     select_node_ptr[consec_req[i]].name);
+		else
+			debug3("start=%s, end=%s, nodes=%d, cpus=%d",
+			       select_node_ptr[consec_start[i]].name,
+			       select_node_ptr[consec_end[i]].name,
+			       consec_nodes[i], consec_cpus[i]);
+	}
 #endif
-	return SLURM_SUCCESS;
+
+	/* accumulate nodes from these sets of consecutive nodes until */
+	/*   sufficient resources have been accumulated */
+	while (consec_index) {
+		best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
+		best_fit_req = -1;	/* first required node, -1 if none */
+		for (i = 0; i < consec_index; i++) {
+			if (consec_nodes[i] == 0)
+				continue;
+			sufficient = ((consec_nodes[i] >= rem_nodes)
+				      && (consec_cpus[i] >= rem_cpus));
+
+			/* if first possibility OR */
+			/* contains required nodes OR */
+			/* first set large enough for request OR */
+			/* tightest fit (less resource waste) OR */
+			/* nothing yet large enough, but this is biggest */
+			if ((best_fit_nodes == 0) ||	
+			    ((best_fit_req == -1) && (consec_req[i] != -1)) ||
+			    (sufficient && (best_fit_sufficient == 0)) ||
+			    (sufficient && (consec_cpus[i] < best_fit_cpus)) ||	
+			    ((sufficient == 0) && 
+			     (consec_cpus[i] > best_fit_cpus))) {
+				best_fit_cpus = consec_cpus[i];
+				best_fit_nodes = consec_nodes[i];
+				best_fit_location = i;
+				best_fit_req = consec_req[i];
+				best_fit_sufficient = sufficient;
+			}
+		}
+		if (best_fit_nodes == 0)
+			break;
+		if (job_ptr->details->contiguous && 
+		    ((best_fit_cpus < rem_cpus) ||
+		     (!_enough_nodes(best_fit_nodes, rem_nodes, 
+				     min_nodes, max_nodes))))
+			break;	/* no hole large enough */
+		if (best_fit_req != -1) {
+			/* This collection of nodes includes required ones;
+			 * select nodes from this set, first working up
+			 * then down from the required nodes */
+			for (i = best_fit_req;
+			     i <= consec_end[best_fit_location]; i++) {
+				if ((rem_nodes <= 0) && (rem_cpus <= 0))
+					break;
+				if (bit_test(bitmap, i))
+					continue;
+				bit_set(bitmap, i);
+				rem_nodes--;
+				if (select_fast_schedule)
+					rem_cpus -= select_node_ptr[i].
+							config_ptr->cpus;
+				else
+					rem_cpus -= select_node_ptr[i].
+							cpus;
+			}
+			for (i = (best_fit_req - 1);
+			     i >= consec_start[best_fit_location]; i--) {
+				if ((rem_nodes <= 0) && (rem_cpus <= 0))
+					break;
+				/* if (bit_test(bitmap, i)) 
+					continue;  cleared above earlier */
+				bit_set(bitmap, i);
+				rem_nodes--;
+				if (select_fast_schedule)
+					rem_cpus -= select_node_ptr[i].
+							config_ptr->cpus;
+				else
+					rem_cpus -= select_node_ptr[i].
+							cpus;
+			}
+		} else {
+			for (i = consec_start[best_fit_location];
+			     i <= consec_end[best_fit_location]; i++) {
+				if ((rem_nodes <= 0) && (rem_cpus <= 0))
+					break;
+				if (bit_test(bitmap, i))
+					continue;
+				bit_set(bitmap, i);
+				rem_nodes--;
+				if (select_fast_schedule)
+					rem_cpus -= select_node_ptr[i].
+							config_ptr->cpus;
+				else
+					rem_cpus -= select_node_ptr[i].
+							cpus;
+			}
+		}
+		if (job_ptr->details->contiguous || 
+		    ((rem_nodes <= 0) && (rem_cpus <= 0))) {
+			error_code = SLURM_SUCCESS;
+			break;
+		}
+		consec_cpus[best_fit_location] = 0;
+		consec_nodes[best_fit_location] = 0;
+	}
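+	/* rem_nodes started at max_nodes, so (max_nodes - rem_nodes) is the
+	 * number of nodes already selected; accept if the CPU requirement
+	 * is met and that count reaches min_nodes */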
+	if (error_code && (rem_cpus <= 0) && 
+	    max_nodes  && ((max_nodes - rem_nodes) >= min_nodes))
+		error_code = SLURM_SUCCESS;
+
+	xfree(consec_cpus);
+	xfree(consec_nodes);
+	xfree(consec_start);
+	xfree(consec_end);
+	xfree(consec_req);
+	return error_code;
 }
 
 extern int select_p_job_init(struct job_record *job_ptr)
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 43588c59b7b..dff0575cc22 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -65,8 +65,6 @@ static void _add_node_set_info(struct node_set *node_set_ptr,
 static int  _build_node_list(struct job_record *job_ptr, 
 			     struct node_set **node_set_pptr,
 			     int *node_set_size);
-static bool _enough_nodes(int avail_nodes, int rem_nodes, int min_nodes,
-			  int max_nodes);
 static void _filter_nodes_in_set(struct node_set *node_set_ptr,
 				 struct job_details *detail_ptr);
 static int _match_feature(char *seek, char *available);
@@ -76,18 +74,13 @@ static int _nodes_in_sets(bitstr_t *req_bitmap,
 static void _node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit, 
 				bitstr_t ** light_load_bit, 
 				bitstr_t ** heavy_load_bit);
-static int _pick_best_layout(bitstr_t * bitmap, bitstr_t * req_bitmap,
-			     int min_nodes, int max_nodes, int req_cpus,
-			     int consecutive);
-static int _pick_best_load(bitstr_t * bitmap, bitstr_t * req_bitmap,
-			   int min_nodes, int max_nodes, int req_cpus,
-			   int consecutive);
-static int _pick_best_nodes(struct node_set *node_set_ptr,
-			    int node_set_size, bitstr_t ** req_bitmap,
-			    uint32_t req_cpus, 
-			    uint32_t min_nodes, uint32_t max_nodes,
-			    int contiguous, int shared,
-			    uint32_t node_lim);
+static int _pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap, 
+			int min_nodes, int max_nodes);
+static int _pick_best_nodes(struct node_set *node_set_ptr, 
+			int node_set_size, bitstr_t ** select_bitmap, 
+			struct job_record *job_ptr, uint32_t min_nodes, 
+			uint32_t max_nodes, int shared, 
+			uint32_t node_lim);
 static int _valid_features(char *requested, char *available);
 
 
@@ -250,248 +243,15 @@ static int _match_feature(char *seek, char *available)
 }
 
 
-/*
- * _pick_best_layout - Given a specification of scheduling requirements, 
- *	identify the nodes which "best" satify the request.
- * 	"best" is defined as either single set of consecutive nodes satisfying 
- *	the request and leaving the minimum number of unused nodes OR 
- *	the fewest number of consecutive node sets
- * IN/OUT bitmap - usable nodes are set on input, nodes not required to 
- *	satisfy the request are cleared, other left set
- * IN req_bitmap - map of required nodes
- * IN min_nodes - minimum count of nodes
- * IN max_nodes - maximum count of nodes (0==don't care)
- * IN req_cpus - count of required processors
- * IN consecutive - allocated nodes must be consecutive if set
- * RET zero on success, EINVAL otherwise
- * globals: node_record_count - count of nodes configured
- *	node_record_table_ptr - pointer to global node table
- * NOTE: bitmap must be a superset of req_nodes at the time that 
- *	_pick_best_layout is called
- */
-static int
-_pick_best_layout(bitstr_t * bitmap, bitstr_t * req_bitmap,
-		    int min_nodes, int max_nodes, 
-		    int req_cpus, int consecutive)
-{
-	int i, index, error_code = EINVAL, sufficient;
-	int *consec_nodes;	/* how many nodes we can add from this 
-				 * consecutive set of nodes */
-	int *consec_cpus;	/* how many nodes we can add from this 
-				 * consecutive set of nodes */
-	int *consec_start;	/* where this consecutive set starts (index) */
-	int *consec_end;	/* where this consecutive set ends (index) */
-	int *consec_req;	/* are nodes from this set required 
-				 * (in req_bitmap) */
-	int consec_index, consec_size;
-	int rem_cpus, rem_nodes;	/* remaining resources required */
-	int best_fit_nodes, best_fit_cpus, best_fit_req;
-	int best_fit_location = 0, best_fit_sufficient;
-
-	xassert(bitmap);
-
-	consec_index = 0;
-	consec_size  = 50;	/* start allocation for 50 sets of 
-				 * consecutive nodes */
-	consec_cpus  = xmalloc(sizeof(int) * consec_size);
-	consec_nodes = xmalloc(sizeof(int) * consec_size);
-	consec_start = xmalloc(sizeof(int) * consec_size);
-	consec_end   = xmalloc(sizeof(int) * consec_size);
-	consec_req   = xmalloc(sizeof(int) * consec_size);
-
-	/* Build table with information about sets of consecutive nodes */
-	consec_cpus[consec_index] = consec_nodes[consec_index] = 0;
-	consec_req[consec_index] = -1;	/* no required nodes here by default */
-	rem_cpus = req_cpus;
-	if (max_nodes)
-		rem_nodes = max_nodes;
-	else
-		rem_nodes = min_nodes;
-	for (index = 0; index < node_record_count; index++) {
-		if (bit_test(bitmap, index)) {
-			if (consec_nodes[consec_index] == 0)
-				consec_start[consec_index] = index;
-			if (slurmctld_conf.fast_schedule)	
-				/* don't bother checking each node */
-				i = node_record_table_ptr[index].
-				    config_ptr->cpus;
-			else
-				i = node_record_table_ptr[index].cpus;
-			if (req_bitmap && bit_test(req_bitmap, index)) {
-				if (consec_req[consec_index] == -1)
-					/* first required node in set */
-					consec_req[consec_index] = index;
-				rem_cpus -= i;
-				rem_nodes--;
-			} else {	 /* node not required (yet) */
-				bit_clear(bitmap, index); 
-				consec_cpus[consec_index] += i;
-				consec_nodes[consec_index]++;
-			}
-		} else if (consec_nodes[consec_index] == 0) {
-			consec_req[consec_index] = -1;
-			/* already picked up any required nodes */
-			/* re-use this record */
-		} else {
-			consec_end[consec_index] = index - 1;
-			if (++consec_index >= consec_size) {
-				consec_size *= 2;
-				xrealloc(consec_cpus,
-					 sizeof(int) * consec_size);
-				xrealloc(consec_nodes,
-					 sizeof(int) * consec_size);
-				xrealloc(consec_start,
-					 sizeof(int) * consec_size);
-				xrealloc(consec_end,
-					 sizeof(int) * consec_size);
-				xrealloc(consec_req,
-					 sizeof(int) * consec_size);
-			}
-			consec_cpus[consec_index] = 0;
-			consec_nodes[consec_index] = 0;
-			consec_req[consec_index] = -1;
-		}
-	}
-	if (consec_nodes[consec_index] != 0)
-		consec_end[consec_index++] = index - 1;
-
-#ifdef EXTREME_DEBUG
-	/* don't compile this, slows things down too much */
-	debug3("rem_cpus=%d, rem_nodes=%d", rem_cpus, rem_nodes);
-	for (i = 0; i < consec_index; i++) {
-		if (consec_req[i] != -1)
-			debug3
-			    ("start=%s, end=%s, nodes=%d, cpus=%d, req=%s",
-			     node_record_table_ptr[consec_start[i]].name,
-			     node_record_table_ptr[consec_end[i]].name,
-			     consec_nodes[i], consec_cpus[i],
-			     node_record_table_ptr[consec_req[i]].name);
-		else
-			debug3("start=%s, end=%s, nodes=%d, cpus=%d",
-			       node_record_table_ptr[consec_start[i]].name,
-			       node_record_table_ptr[consec_end[i]].name,
-			       consec_nodes[i], consec_cpus[i]);
-	}
-#endif
-
-	/* accumulate nodes from these sets of consecutive nodes until */
-	/*   sufficient resources have been accumulated */
-	while (consec_index) {
-		best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
-		best_fit_req = -1;	/* first required node, -1 if none */
-		for (i = 0; i < consec_index; i++) {
-			if (consec_nodes[i] == 0)
-				continue;
-			sufficient = ((consec_nodes[i] >= rem_nodes)
-				      && (consec_cpus[i] >= rem_cpus));
-
-			/* if first possibility OR */
-			/* contains required nodes OR */
-			/* first set large enough for request OR */
-			/* tightest fit (less resource waste) OR */
-			/* nothing yet large enough, but this is biggest */
-			if ((best_fit_nodes == 0) ||	
-			    ((best_fit_req == -1) && (consec_req[i] != -1)) ||
-			    (sufficient && (best_fit_sufficient == 0)) ||
-			    (sufficient && (consec_cpus[i] < best_fit_cpus)) ||	
-			    ((sufficient == 0) && 
-			     (consec_cpus[i] > best_fit_cpus))) {
-				best_fit_cpus = consec_cpus[i];
-				best_fit_nodes = consec_nodes[i];
-				best_fit_location = i;
-				best_fit_req = consec_req[i];
-				best_fit_sufficient = sufficient;
-			}
-		}
-		if (best_fit_nodes == 0)
-			break;
-		if (consecutive && 
-		    ((best_fit_cpus < rem_cpus) ||
-		     (!_enough_nodes(best_fit_nodes, rem_nodes, 
-				     min_nodes, max_nodes))))
-			break;	/* no hole large enough */
-		if (best_fit_req != -1) {
-			/* This collection of nodes includes required ones
-			 * select nodes from this set, first working up
-			 * then down from the required nodes */
-			for (i = best_fit_req;
-			     i <= consec_end[best_fit_location]; i++) {
-				if ((rem_nodes <= 0) && (rem_cpus <= 0))
-					break;
-				if (bit_test(bitmap, i))
-					continue;
-				bit_set(bitmap, i);
-				rem_nodes--;
-				if (slurmctld_conf.fast_schedule)
-					rem_cpus -= node_record_table_ptr[i].
-							config_ptr->cpus;
-				else
-					rem_cpus -= node_record_table_ptr[i].
-							cpus;
-			}
-			for (i = (best_fit_req - 1);
-			     i >= consec_start[best_fit_location]; i--) {
-				if ((rem_nodes <= 0) && (rem_cpus <= 0))
-					break;
-				/* if (bit_test(bitmap, i)) 
-					continue;  cleared above earlier */
-				bit_set(bitmap, i);
-				rem_nodes--;
-				if (slurmctld_conf.fast_schedule)
-					rem_cpus -= node_record_table_ptr[i].
-							config_ptr->cpus;
-				else
-					rem_cpus -= node_record_table_ptr[i].
-							cpus;
-			}
-		} else {
-			for (i = consec_start[best_fit_location];
-			     i <= consec_end[best_fit_location]; i++) {
-				if ((rem_nodes <= 0) && (rem_cpus <= 0))
-					break;
-				if (bit_test(bitmap, i))
-					continue;
-				bit_set(bitmap, i);
-				rem_nodes--;
-				if (slurmctld_conf.fast_schedule)
-					rem_cpus -= node_record_table_ptr[i].
-							config_ptr->cpus;
-				else
-					rem_cpus -= node_record_table_ptr[i].
-							cpus;
-			}
-		}
-		if (consecutive || 
-		    ((rem_nodes <= 0) && (rem_cpus <= 0))) {
-			error_code = SLURM_SUCCESS;
-			break;
-		}
-		consec_cpus[best_fit_location] = 0;
-		consec_nodes[best_fit_location] = 0;
-	}
-	if (error_code && (rem_cpus <= 0) && 
-	    max_nodes  && ((max_nodes - rem_nodes) >= min_nodes))
-		error_code = SLURM_SUCCESS;
-
-	xfree(consec_cpus);
-	xfree(consec_nodes);
-	xfree(consec_start);
-	xfree(consec_end);
-	xfree(consec_req);
-	return error_code;
-}
-
 /*
  * _pick_best_load - Given a specification of scheduling requirements, 
  *	identify the nodes which "best" satisfy the request.
  * 	"best" is defined as the least loaded nodes
+ * IN job_ptr - pointer to job being scheduled
  * IN/OUT bitmap - usable nodes are set on input, nodes not required to 
  *	satisfy the request are cleared, others left set
- * IN req_bitmap - map of required nodes
  * IN min_nodes - minimum count of nodes
  * IN max_nodes - maximum count of nodes (0==don't care)
- * IN req_cpus - count of required processors
- * IN consecutive - allocated nodes must be consecutive if set
  * RET zero on success, EINVAL otherwise
  * globals: node_record_count - count of nodes configured
  *	node_record_table_ptr - pointer to global node table
@@ -499,9 +259,8 @@ _pick_best_layout(bitstr_t * bitmap, bitstr_t * req_bitmap,
  *	_pick_best_load is called
  */
 static int
-_pick_best_load(bitstr_t * bitmap, bitstr_t * req_bitmap,
-		int min_nodes, int max_nodes, 
-		int req_cpus, int consecutive)
+_pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap, 
+		int min_nodes, int max_nodes)
 {
 	bitstr_t *no_load_bit, *light_load_bit, *heavy_load_bit;
 	int error_code;
@@ -512,22 +271,22 @@ _pick_best_load(bitstr_t * bitmap, bitstr_t * req_bitmap,
 	/* first try to use idle nodes */
 	bit_and(bitmap, no_load_bit);
 	FREE_NULL_BITMAP(no_load_bit);
-	error_code = _pick_best_layout(bitmap, req_bitmap, min_nodes, 
-				max_nodes, req_cpus, consecutive);
+	error_code = select_g_job_test(job_ptr, bitmap, 
+			min_nodes, max_nodes);
 
 	/* now try to use idle and lightly loaded nodes */
 	if (error_code) {
 		bit_or(bitmap, light_load_bit);
-		error_code = _pick_best_layout(bitmap, req_bitmap, min_nodes, 
-				max_nodes, req_cpus, consecutive);
+		error_code = select_g_job_test(job_ptr, bitmap, 
+				min_nodes, max_nodes);
 	} 
 	FREE_NULL_BITMAP(light_load_bit);
 
 	/* now try to use all possible nodes */
 	if (error_code) {
 		bit_or(bitmap, heavy_load_bit);
-		error_code = _pick_best_layout(bitmap, req_bitmap, min_nodes, 
-				max_nodes, req_cpus, consecutive);
+		error_code = select_g_job_test(job_ptr, bitmap, 
+				min_nodes, max_nodes);
 	}
 	FREE_NULL_BITMAP(heavy_load_bit);
 
@@ -573,31 +332,16 @@ _node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit,
 	*heavy_load_bit = bitmap2;
 }
 
-static bool 
-_enough_nodes(int avail_nodes, int rem_nodes, int min_nodes, int max_nodes)
-{
-	int needed_nodes;
-
-	if (max_nodes)
-		needed_nodes = rem_nodes + min_nodes - max_nodes;
-	else
-		needed_nodes = rem_nodes;
-
-	return(avail_nodes >= needed_nodes);
-}
-
 
 /*
  * _pick_best_nodes - from a weight-ordered list of all nodes satisfying a 
  *	job's specifications, select the "best" for use
  * IN node_set_ptr - pointer to node specification information
  * IN node_set_size - number of entries in records pointed to by node_set_ptr
- * IN/OUT req_bitmap - pointer to bitmap of specific nodes required by the 
- *	job, could be NULL, returns bitmap of selected nodes, must xfree
- * IN req_cpus - count of cpus required by the job
+ * OUT select_bitmap - returns bitmap of selected nodes, must FREE_NULL_BITMAP
+ * IN job_ptr - pointer to job being scheduled
  * IN min_nodes - minimum count of nodes required by the job
  * IN max_nodes - maximum count of nodes required by the job (0==no limit)
- * IN contiguous - 1 if allocated nodes must be contiguous, 0 otherwise
  * IN shared - set to 1 if nodes may be shared, 0 otherwise
  * IN node_lim - maximum number of nodes permitted for job, 
  *	INFINITE for no limit (partition limit)
@@ -607,7 +351,7 @@ _enough_nodes(int avail_nodes, int rem_nodes, int min_nodes, int max_nodes)
  *	be satisfied, or
  *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE if the job can not be 
  *	initiated until the partition's configuration changes
- * NOTE: the caller must xfree memory pointed to by req_bitmap
+ * NOTE: the caller must FREE_NULL_BITMAP memory pointed to by select_bitmap
  * Notes: The algorithm is
  *	1) If required node list is specified, determine implicitly required
  *	   processor and node count 
@@ -616,18 +360,19 @@ _enough_nodes(int avail_nodes, int rem_nodes, int min_nodes, int max_nodes)
  *	3) For each feature: find matching node table entries, identify nodes 
  *	   that are up and available (idle or shared) and add them to a bit 
  *	   map
- *	4) If nodes _not_ shared then call _pick_best_layout() to select the 
+ *	4) If nodes _not_ shared then call select_g_job_test() to select the 
  *	   "best" of those based upon topology, else call _pick_best_load()
  *	   to pick the "best" nodes in terms of workload
- *	5) If request can't be satified now, execute _pick_best_layout() 
- *	   against the list of nodes that exist in any state (perhaps down 
- *	   or busy) to determine if the request can ever be satified.
+ *	5) If request can't be satisfied now, execute select_g_job_test() 
+ *	   against the list of nodes that exist in any state (perhaps DOWN, 
+ *	   DRAINED or ALLOCATED) to determine if the request can
+ *	   ever be satisfied.
  */
 static int
 _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
-		 bitstr_t ** req_bitmap, uint32_t req_cpus,
+		 bitstr_t ** select_bitmap, struct job_record *job_ptr,
 		 uint32_t min_nodes, uint32_t max_nodes,
-		 int contiguous, int shared, uint32_t node_lim)
+		 int shared, uint32_t node_lim)
 {
 	int error_code = SLURM_SUCCESS, i, j, pick_code;
 	int total_nodes = 0, total_cpus = 0;	/* total resources configured 
@@ -639,18 +384,25 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 	bool runable_ever  = false;	/* Job can ever run */
 	bool runable_avail = false;	/* Job can run with available nodes */
 
+#ifdef HAVE_BGL
+	if (shared) {
+		error("attempt to share Blue Gene nodes");
+		shared = 0;
+	}
+#endif
+
 	if (node_set_size == 0) {
 		info("_pick_best_nodes: empty node set for selection");
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
 	}
 
-	if (*req_bitmap) {	/* specific nodes required */
+	if (job_ptr->details->req_node_bitmap) {	/* specific nodes required */
 		/* we have already confirmed that all of these nodes have a
 		 * usable configuration and are in the proper partition */
 		if (min_nodes != 0)
-			total_nodes = bit_set_count(*req_bitmap);
-		if (req_cpus != 0)
-			total_cpus = count_cpus(*req_bitmap);
+			total_nodes = bit_set_count(job_ptr->details->req_node_bitmap);
+		if (job_ptr->num_procs != 0)
+			total_cpus = count_cpus(job_ptr->details->req_node_bitmap);
 		if ((max_nodes != 0) &&
 		    (total_nodes > max_nodes)) {
 			info("_pick_best_nodes: required nodes exceed limit");
@@ -662,20 +414,22 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 		}
 		if ((min_nodes <= total_nodes) && 
 		    (max_nodes <= min_nodes  ) &&
-		    (req_cpus  <= total_cpus )) {
-			if (!bit_super_set(*req_bitmap, avail_node_bitmap))
+		    (job_ptr->num_procs <= total_cpus )) {
+			if (!bit_super_set(job_ptr->details->req_node_bitmap, 
+					avail_node_bitmap))
 				return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
 			if (shared) {
-				if (!bit_super_set(*req_bitmap, 
+				if (!bit_super_set(job_ptr->details->req_node_bitmap, 
 							share_node_bitmap))
 					return ESLURM_NODES_BUSY;
 			} else {
-				if (!bit_super_set(*req_bitmap, 
+				if (!bit_super_set(job_ptr->details->req_node_bitmap, 
 							idle_node_bitmap))
 					return ESLURM_NODES_BUSY;
 			}
-			return SLURM_SUCCESS;	/* user can have selected 
-						 * nodes, we're done! */
+			/* still must go through select_g_job_test() to 
+			 * determine validity of request and/or perform
+			 * set-up before job launch */
 		}
 		total_nodes = total_cpus = 0;	/* reinitialize */
 	}
@@ -708,25 +462,21 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 				bit_set_count(node_set_ptr[i].my_bitmap);
 			_add_node_set_info(&node_set_ptr[i], &avail_bitmap, 
 					   &avail_nodes, &avail_cpus);
-			if ((*req_bitmap) &&
-			    (!bit_super_set(*req_bitmap, avail_bitmap)))
+			if ((job_ptr->details->req_node_bitmap) &&
+			    (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap)))
 				continue;
 			if ((avail_nodes  < min_nodes) ||
-			    (avail_cpus   < req_cpus) ||
+			    (avail_cpus   < job_ptr->num_procs) ||
 			    ((max_nodes   > min_nodes) && 
 			     (avail_nodes < max_nodes)))
 				continue;	/* Keep accumulating nodes */
 
-			if (shared)
-				pick_code = _pick_best_load(avail_bitmap, 
-							*req_bitmap, min_nodes,
-							max_nodes, req_cpus, 
-							contiguous);
-			else
-				pick_code = _pick_best_layout(avail_bitmap, 
-							*req_bitmap, min_nodes,
-							max_nodes, req_cpus, 
-							contiguous);
+			if (shared) {
+				pick_code = _pick_best_load(job_ptr, avail_bitmap, 
+							min_nodes, max_nodes);
+			} else
+				pick_code = select_g_job_test(job_ptr, avail_bitmap, 
+						min_nodes, max_nodes);
 
 			if (pick_code == SLURM_SUCCESS) {
 				if ((node_lim != INFINITE) && 
@@ -736,8 +486,7 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 					break;
 				}
 				FREE_NULL_BITMAP(total_bitmap);
-				FREE_NULL_BITMAP(*req_bitmap);
-				*req_bitmap = avail_bitmap;
+				*select_bitmap = avail_bitmap;
 				return SLURM_SUCCESS;
 			}
 		}
@@ -746,16 +495,13 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 		if ((max_nodes   >  min_nodes) && 
 		    (avail_nodes >= min_nodes) &&
 		    (avail_nodes <  max_nodes)) {
-			pick_code = _pick_best_layout(avail_bitmap, 
-							*req_bitmap, min_nodes,
-							max_nodes, req_cpus, 
-							contiguous);
+			pick_code = select_g_job_test(job_ptr, avail_bitmap, 
+						min_nodes, max_nodes);
 			if ((pick_code == SLURM_SUCCESS) &&
 			    ((node_lim == INFINITE) ||
 			     (bit_set_count(avail_bitmap) <= node_lim))) {
 				FREE_NULL_BITMAP(total_bitmap);
-				FREE_NULL_BITMAP(*req_bitmap);
-				*req_bitmap = avail_bitmap;
+				*select_bitmap = avail_bitmap;
 				return SLURM_SUCCESS;
 			}
 		}
@@ -763,20 +509,18 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 		/* determine if job could possibly run (if all configured 
 		 * nodes available) */
 		if ((!runable_ever || !runable_avail) &&
-		    (total_nodes >= min_nodes) && (total_cpus >= req_cpus) &&
-		    ((*req_bitmap == NULL) ||
-		     (bit_super_set(*req_bitmap, total_bitmap)))) {
+		    (total_nodes >= min_nodes) && 
+		    (total_cpus >= job_ptr->num_procs) &&
+		    ((job_ptr->details->req_node_bitmap == NULL) ||
+		     (bit_super_set(job_ptr->details->req_node_bitmap, total_bitmap)))) {
 			if (!runable_avail) {
 				FREE_NULL_BITMAP(avail_bitmap);
 				avail_bitmap = bit_copy(total_bitmap);
 				if (avail_bitmap == NULL)
 					fatal("bit_copy malloc failure");
 				bit_and(avail_bitmap, avail_node_bitmap);
-				pick_code = _pick_best_layout(
-							avail_bitmap, 
-							*req_bitmap, min_nodes,
-							max_nodes, req_cpus, 
-							contiguous);
+				pick_code = select_g_job_test(job_ptr, avail_bitmap, 
+						min_nodes, max_nodes);
 				if (pick_code == SLURM_SUCCESS) {
 					runable_ever  = true;
 					if ((node_lim == INFINITE) ||
@@ -786,11 +530,8 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 				}
 			}
 			if (!runable_ever) {
-				pick_code = _pick_best_layout(
-							total_bitmap, 
-							*req_bitmap, min_nodes,
-							max_nodes, req_cpus, 
-							contiguous);
+				pick_code = select_g_job_test(job_ptr, total_bitmap, 
+						min_nodes, max_nodes);
 				if (pick_code == SLURM_SUCCESS)
 					runable_ever = true;
 			}
@@ -859,7 +600,7 @@ _add_node_set_info(struct node_set *node_set_ptr,
 int select_nodes(struct job_record *job_ptr, bool test_only)
 {
 	int error_code = SLURM_SUCCESS, i, shared, node_set_size = 0;
-	bitstr_t *req_bitmap = NULL;
+	bitstr_t *select_bitmap = NULL;
 	struct job_details *detail_ptr = job_ptr->details;
 	struct node_set *node_set_ptr = NULL;
 	struct part_record *part_ptr = job_ptr->part_ptr;
@@ -915,12 +656,9 @@ int select_nodes(struct job_record *job_ptr, bool test_only)
 			     job_ptr->job_id);
 			goto cleanup;
 		}
-		req_bitmap = bit_copy(job_ptr->details->req_node_bitmap);
-		if (req_bitmap == NULL)
-			fatal("bit_copy malloc failure");
 	}
 
-	/* pick the nodes providing a best-fit */
+	/* enforce both user's and partition's node limits */
 	if (super_user) {
 		min_nodes = job_ptr->details->min_nodes;
 		part_node_limit = INFINITE;
@@ -944,9 +682,8 @@ int select_nodes(struct job_record *job_ptr, bool test_only)
 		shared = job_ptr->details->shared;
 
 	error_code = _pick_best_nodes(node_set_ptr, node_set_size,
-				      &req_bitmap, job_ptr->num_procs,
+				      &select_bitmap, job_ptr,
 				      min_nodes, max_nodes,
-				      job_ptr->details->contiguous, 
 				      shared, part_node_limit);
 	if (error_code) {
 		if (detail_ptr)
@@ -977,12 +714,12 @@ int select_nodes(struct job_record *job_ptr, bool test_only)
 	/* assign the nodes and stage_in the job */
 	if (detail_ptr)
 		detail_ptr->wait_reason = WAIT_NO_REASON;
-	job_ptr->nodes = bitmap2node_name(req_bitmap);
-	job_ptr->node_bitmap = req_bitmap;
+	job_ptr->nodes = bitmap2node_name(select_bitmap);
+	job_ptr->node_bitmap = select_bitmap;
 	job_ptr->details->shared = shared;
+	select_bitmap = NULL;	/* nothing left to free */
 	allocate_nodes(job_ptr);
 	build_node_details(job_ptr);
-	req_bitmap = NULL;
 	job_ptr->job_state = JOB_RUNNING;
 	job_ptr->start_time = job_ptr->time_last_active = time(NULL);
 	if (job_ptr->time_limit == NO_VAL)
@@ -995,7 +732,7 @@ int select_nodes(struct job_record *job_ptr, bool test_only)
 				    (job_ptr->time_limit * 60);   /* secs */
 
       cleanup:
-	FREE_NULL_BITMAP(req_bitmap);
+	FREE_NULL_BITMAP(select_bitmap);
 	if (node_set_ptr) {
 		for (i = 0; i < node_set_size; i++)
 			FREE_NULL_BITMAP(node_set_ptr[i].my_bitmap);
@@ -1155,6 +892,7 @@ static void _filter_nodes_in_set(struct node_set *node_set_ptr,
 }
 
 /*
+ * _nodes_in_sets - Determine if required nodes are included in node_set(s)
  * IN req_bitmap - nodes specifically required by the job 
  * IN node_set_ptr - sets of valid nodes
  * IN node_set_size - count of node_set entries
diff --git a/src/slurmctld/select_plugin.c b/src/slurmctld/select_plugin.c
index bbeadaa7416..b0da617c8ba 100644
--- a/src/slurmctld/select_plugin.c
+++ b/src/slurmctld/select_plugin.c
@@ -46,8 +46,7 @@ typedef struct slurm_select_ops {
 	int 		(*node_init)		( struct node_record *node_ptr,
 						  int node_cnt);
 	int		(*job_test)		( struct job_record *job_ptr,
-						  bitstr_t bitmap, 
-						  int min_nodes, 
+						  bitstr_t *bitmap, int min_nodes, 
 						  int max_nodes );
 	int		(*job_init)		( struct job_record *job_ptr );
 	int		(*job_fini)		( struct job_record *job_ptr );
@@ -239,7 +238,7 @@ extern int select_g_node_init(struct node_record *node_ptr, int node_cnt)
 /*
  * Select the "best" nodes for given job from those available
  */
-extern int select_g_job_test(struct job_record *job_ptr, bitstr_t bitmap,
+extern int select_g_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
         int min_nodes, int max_nodes)
 {
 	if (slurm_select_init() < 0)
diff --git a/src/slurmctld/select_plugin.h b/src/slurmctld/select_plugin.h
index c6ac6c34722..70c1988ebe6 100644
--- a/src/slurmctld/select_plugin.h
+++ b/src/slurmctld/select_plugin.h
@@ -53,7 +53,7 @@ extern int select_g_node_init(struct node_record *node_ptr, int node_cnt);
 /*
  * Select the "best" nodes for given job from those available
  */
-extern int select_g_job_test(struct job_record *job_ptr, bitstr_t bitmap,
+extern int select_g_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
 	int min_nodes, int max_nodes);
 
 /*
-- 
GitLab