Skip to content
Snippets Groups Projects
node_scheduler.c 62.7 KiB
Newer Older
/*****************************************************************************\
 *  node_scheduler.c - select and allocate nodes to jobs
 *	Note: there is a global node table (node_record_table_ptr) 
 *****************************************************************************
 *  Copyright (C) 2002-2006 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  
 *  This file is part of SLURM, a resource management program.
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 *  
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission 
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and 
 *  distribute linked combinations including the two. You must obey the GNU 
 *  General Public License in all respects for all of the code used other than 
 *  OpenSSL. If you modify file(s) with this exception, you may extend this 
 *  exception to your version of the file(s), but you are not obligated to do 
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in 
 *  the program, then also delete it here.
 *  
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *  
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/
#ifdef HAVE_SYS_SYSLOG_H
#  include <sys/syslog.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <slurm/slurm_errno.h>

#include "src/common/hostlist.h"
#include "src/common/list.h"
#include "src/common/node_select.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/slurmctld.h"
#define MAX_FEATURES  32	/* max exclusive features "[fs1|fs2]"=2 */
#define MAX_RETRIES   10
struct node_set {		/* set of nodes with same configuration */
	uint32_t cpus_per_node;	/* NOTE: This is the minimum count,
				 * if FastSchedule==0 then individual 
				 * nodes within the same configuration 
				 * line (in slurm.conf) can actually 
				 * have different CPU counts */
	uint32_t real_memory;
	uint32_t nodes;
	uint32_t weight;
Moe Jette's avatar
Moe Jette committed
};

static int _add_node_set_info(struct node_set *node_set_ptr, 
			      bitstr_t ** node_bitmap, 
			      int *node_cnt, int *cpu_cnt, 
			      const int mem_cnt, int cr_enabled,
static int  _build_feature_list(struct job_record *job_ptr);
static int  _build_node_list(struct job_record *job_ptr, 
			     struct node_set **node_set_pptr,
			     int *node_set_size);
static void _feature_list_delete(void *x);
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
				 struct job_details *detail_ptr);
static int _match_feature(char *seek, char *available);
static int _nodes_in_sets(bitstr_t *req_bitmap, 
			  struct node_set * node_set_ptr, 
			  int node_set_size);
static int _pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap, 
			   uint32_t min_nodes, uint32_t max_nodes, 
			   uint32_t req_nodes, bool test_only);
static int _job_count_bitmap(bitstr_t * bitmap, bitstr_t * jobmap,
			     int job_cnt); 
static int _pick_best_nodes(struct node_set *node_set_ptr,
			    int node_set_size, bitstr_t ** select_bitmap,
			    struct job_record *job_ptr,
			    struct part_record *part_ptr,
			    uint32_t min_nodes, uint32_t max_nodes,
			    uint32_t req_nodes);
static void _print_feature_list(uint32_t job_id, List feature_list);
static bitstr_t *_valid_features(struct job_details *detail_ptr, 
				 char *available);
/*
 * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
 * IN job_ptr - job being allocated resources
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
 *	last_node_update - last update time of node table
extern void allocate_nodes(struct job_record *job_ptr)
	last_node_update = time(NULL);
	for (i = 0; i < node_record_count; i++) {
		if (bit_test(job_ptr->node_bitmap, i))
			make_node_alloc(&node_record_table_ptr[i], job_ptr);
 * count_cpus - report how many cpus are associated with the identified nodes 
 * IN bitmap - map of nodes to tally
 * RET cpu count
 * globals: node_record_count - number of nodes configured
 *	node_record_table_ptr - pointer to global node table
Moe Jette's avatar
Moe Jette committed
 */
Danny Auble's avatar
Danny Auble committed
extern int count_cpus(bitstr_t *bitmap)
Moe Jette's avatar
Moe Jette committed

	for (i = 0; i < node_record_count; i++) {
		if (bit_test(bitmap, i) != 1)
		if (slurmctld_conf.fast_schedule)
			sum += node_record_table_ptr[i].config_ptr->cpus;
		else
			sum += node_record_table_ptr[i].cpus;
/*
 * deallocate_nodes - for a given job, deallocate its nodes and mark 
 *	them as completing (see make_node_comp())
 * IN job_ptr - pointer to terminating job (already in some COMPLETING state)
 * IN timeout - true if job exhausted time limit, send REQUEST_KILL_TIMELIMIT
 *	RPC instead of REQUEST_TERMINATE_JOB
 * IN suspended - true if job was already suspended (node's job_run_cnt 
 *	already decremented);
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
 */
extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, 
	kill_job_msg_t *kill_job = NULL;
	agent_arg_t *agent_args = NULL;
	int down_node_cnt = 0;
	xassert(job_ptr);
	xassert(job_ptr->details);
	if (select_g_job_fini(job_ptr) != SLURM_SUCCESS)
		error("select_g_job_fini(%u): %m", job_ptr->job_id);

	agent_args = xmalloc(sizeof(agent_arg_t));
	if (timeout)
		agent_args->msg_type = REQUEST_KILL_TIMELIMIT;
	else
		agent_args->msg_type = REQUEST_TERMINATE_JOB;
	agent_args->retry = 0;	/* re_kill_job() resends as needed */
	agent_args->hostlist = hostlist_create("");
	kill_job = xmalloc(sizeof(kill_job_msg_t));
	last_node_update = time(NULL);
	kill_job->job_id  = job_ptr->job_id;
	kill_job->job_uid = job_ptr->user_id;
	kill_job->nodes   = xstrdup(job_ptr->nodes);
	kill_job->time    = time(NULL);
	kill_job->select_jobinfo = select_g_copy_jobinfo(
			job_ptr->select_jobinfo);
	for (i = 0; i < node_record_count; i++) {
		struct node_record *node_ptr = &node_record_table_ptr[i];
		if (bit_test(job_ptr->node_bitmap, i) == 0)
		base_state = node_ptr->node_state & NODE_STATE_BASE;
		if (base_state == NODE_STATE_DOWN) {
			/* Issue the KILL RPC, but don't verify response */
			down_node_cnt++;
			bit_clear(job_ptr->node_bitmap, i);
		make_node_comp(node_ptr, job_ptr, suspended);
#ifdef HAVE_FRONT_END		/* Operate only on front-end */
		if (agent_args->node_count > 0)
			continue;
#endif
		hostlist_push(agent_args->hostlist, node_ptr->name);
	if ((agent_args->node_count - down_node_cnt) == 0) {
		job_ptr->job_state &= (~JOB_COMPLETING);
		delete_step_records(job_ptr, 1);
	if (agent_args->node_count == 0) {
		error("Job %u allocated no nodes to be killed on",
		xfree(kill_job->nodes);
		select_g_free_jobinfo(&kill_job->select_jobinfo);
	agent_args->msg_args = kill_job;
 * _match_feature - determine if the desired feature is one of those available
 * IN seek - desired feature
Moe Jette's avatar
Moe Jette committed
 * IN available - comma separated list of available features
 * RET 1 if found, 0 otherwise
static int _match_feature(char *seek, char *available)
	char *tmp_available = NULL, *str_ptr3 = NULL, *str_ptr4 = NULL;
	if (seek == NULL)
		return 1;	/* nothing to look for */
	if (available == NULL)
		return SLURM_SUCCESS;	/* nothing to find */
	tmp_available = xstrdup(available);
	str_ptr3 = (char *) strtok_r(tmp_available, ",", &str_ptr4);
		if (strcmp(seek, str_ptr3) == 0) {	/* we have a match */
		}
		str_ptr3 = (char *) strtok_r(NULL, ",", &str_ptr4);
/*
 * _pick_best_load - Given a specification of scheduling requirements, 
 *	identify the nodes which "best" satisfy the request.
 * 	"best" is defined as the least loaded nodes
 * IN job_ptr - pointer to job being scheduled
 * IN/OUT bitmap - usable nodes are set on input, nodes not required to 
 *	satisfy the request are cleared, other left set
 * IN min_nodes - minimum count of nodes
 * IN max_nodes - maximum count of nodes (0==don't care)
 * IN req_nodes - requested (or desired) count of nodes
 * RET zero on success, EINVAL otherwise
 * globals: node_record_count - count of nodes configured
 *	node_record_table_ptr - pointer to global node table
 * NOTE: bitmap must be a superset of req_nodes at the time that 
 *	_pick_best_load is called
 */
static int
_pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap, 
		uint32_t min_nodes, uint32_t max_nodes, 
		uint32_t req_nodes, bool test_only)
	int i, error_code = EINVAL, node_cnt = 0, prev_cnt = 0, set_cnt;

	basemap = bit_copy(bitmap);
	if (basemap == NULL)
		fatal("bit_copy malloc failure");

	set_cnt = bit_set_count(bitmap);
	if ((set_cnt < min_nodes) ||
	    ((req_nodes > min_nodes) && (set_cnt < req_nodes)))
		return error_code;	/* not usable */

	for (i=0; node_cnt<set_cnt; i++) {
		node_cnt = _job_count_bitmap(basemap, bitmap, i);
		if ((node_cnt == 0) || (node_cnt == prev_cnt))
			continue;	/* nothing new to test */
		if ((node_cnt < min_nodes) ||
		    ((req_nodes > min_nodes) && (node_cnt < req_nodes)))
		error_code = select_g_job_test(job_ptr, bitmap, 
					       min_nodes, max_nodes, 
					       req_nodes, test_only);
/*
 * Set the bits in 'jobmap' that correspond to bits in the 'bitmap'
 * that are running 'job_cnt' jobs or less, and clear the rest.
static int
_job_count_bitmap(bitstr_t * bitmap, bitstr_t * jobmap, int job_cnt) 
	bitoff_t size = bit_size(bitmap);
		if (bit_test(bitmap, i) &&
		    (node_record_table_ptr[i].run_job_cnt <= job_cnt)) {
			bit_set(jobmap, i);
			count++;
		} else {
			bit_clear(jobmap, i);
		}
/*
 * Decide if a job can share nodes with other jobs based on the
 * following three input parameters:
 *
 * IN user_flag - may be 0 (do not share nodes), 1 (node sharing allowed),
 *                or any other number means "don't care"
 * IN part_max_share - current partition's node sharing policy
 * IN cons_res_flag - 1 if the consumable resources flag is enabled, 0 otherwise
 *
 * RET - 1 if nodes can be shared, 0 if nodes cannot be shared
 *
 *
 * The following table details the node SHARED state for the various scenarios
 *
 *					part=	part=	part=	part=
 *	cons_res	user_request	EXCLUS	NO	YES	FORCE
 *	--------	------------	------	-----	-----	-----
 *	no		default/exclus	whole	whole	whole	share/O
 *	no		share=yes	whole	whole	share/O	share/O
 *	yes		default		whole	share	share/O	share/O
 *	yes		exclusive	whole	whole	whole	share/O
 *	yes		share=yes	whole	share	share/O	share/O
 *
 * whole   = whole node is allocated exclusively to the user
 * share   = nodes may be shared but the resources are not overcommitted
 * share/O = nodes are shared and the resources can be overcommitted
 *
 * part->max_share:
 *	&SHARED_FORCE 	= FORCE
 *	0		= EXCLUSIVE
 *	1		= NO
 *	> 1		= YES
 *
 * job_ptr->details->shared:
 *	(uint16_t)NO_VAL	= default
 *	0			= exclusive
 *	1			= share=yes
 *
 * Here are the desired scheduler actions to take:
 * IF cons_res enabled,     THEN 'shared' ensures that p_i_bitmap is used AND
 *				 _pick_best_load IS NOT called
 * IF cons_res NOT enabled, THEN 'shared' ensures that share_bitmap is used AND
 *				 _pick_best_load IS called
_resolve_shared_status(uint16_t user_flag, uint16_t part_max_share,
	/* no sharing if part=EXCLUSIVE */
	if (part_max_share == 0)
		return 0;
	/* sharing if part=FORCE */
	if (part_max_share & SHARED_FORCE)
		return 1;
		/* sharing unless user requested exclusive */
		if (user_flag == 0)
			return 0;
		return 1;
		/* no sharing if part=NO */
		if (part_max_share == 1)
			return 0;
		/* share if the user requested it */
		if (user_flag == 1)
			return 1;
/*
 * _get_req_features - select nodes for a job which may have required 
 *	feature counts.
 * For each record in the job's feature list having a non-zero count, 
 * _pick_best_nodes() is called using only the node sets which offer that 
 * feature. The nodes selected are accumulated and temporarily added to 
 * the job's required node bitmap. A final call to _pick_best_nodes() 
 * against all node sets then satisfies the remainder of the request.
 * Upon completion, the job's min_nodes, num_procs and req_node_bitmap 
 * are returned to the values which were in effect upon calling this 
 * function.
 * Input and output are the same as _pick_best_nodes().
 */
static int
_get_req_features(struct node_set *node_set_ptr, int node_set_size,
		  bitstr_t ** select_bitmap, struct job_record *job_ptr,
		  struct part_record *part_ptr,
		  uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes)
{
	uint32_t saved_min_nodes, saved_job_min_nodes;
	bitstr_t *saved_req_node_bitmap = NULL;
	uint32_t saved_num_procs, saved_req_nodes;
	int tmp_node_set_size;
	struct node_set *tmp_node_set_ptr;
	int error_code = SLURM_SUCCESS, i;
	bitstr_t *feature_bitmap, *accumulate_bitmap = NULL;

	/* save job and request state */
	saved_min_nodes = min_nodes;
	saved_req_nodes = req_nodes;
	saved_job_min_nodes = job_ptr->details->min_nodes;
	/* Take ownership of the job's required node bitmap instead of 
	 * copying it. Copying and then clearing the job's pointer would 
	 * orphan (leak) the original bitmap on every call. The saved 
	 * bitmap is handed back to the job before returning. */
	saved_req_node_bitmap = job_ptr->details->req_node_bitmap;
	job_ptr->details->req_node_bitmap = NULL;
	saved_num_procs = job_ptr->num_procs;
	job_ptr->num_procs = 1;
	tmp_node_set_ptr = xmalloc(sizeof(struct node_set) * node_set_size);

	/* Accumulate nodes with required feature counts.
	 * Ignored if job_ptr->details->req_node_layout is set (by wiki2).
	 * Selected nodes become part of job's required node list. */
	if (job_ptr->details->feature_list &&
	    (job_ptr->details->req_node_layout == NULL)) {
		ListIterator feat_iter;
		struct feature_record *feat_ptr;
		feat_iter = list_iterator_create(job_ptr->details->feature_list);
		while((feat_ptr = (struct feature_record *)
				list_next(feat_iter))) {
			if (feat_ptr->count == 0)
				continue;
			tmp_node_set_size = 0;
			/* _pick_best_nodes() is destructive of the node_set
			 * data structure, so we need to copy then purge */
			for (i=0; i<node_set_size; i++) {
				struct node_set *src = &node_set_ptr[i];
				struct node_set *dst;

				if (!_match_feature(feat_ptr->name, 
						src->features))
					continue;
				dst = &tmp_node_set_ptr[tmp_node_set_size];
				dst->cpus_per_node = src->cpus_per_node;
				dst->real_memory   = src->real_memory;
				dst->nodes         = src->nodes;
				dst->weight        = src->weight;
				dst->features      = xstrdup(src->features);
				dst->feature_bits  = 
					bit_copy(src->feature_bits);
				dst->my_bitmap     = bit_copy(src->my_bitmap);
				tmp_node_set_size++;
			}
			/* Request exactly feat_ptr->count nodes (and at 
			 * least that many processors) with this feature */
			feature_bitmap = NULL;
			min_nodes = feat_ptr->count;
			req_nodes = feat_ptr->count;
			job_ptr->details->min_nodes = feat_ptr->count;
			job_ptr->num_procs = feat_ptr->count;
			error_code = _pick_best_nodes(tmp_node_set_ptr, 
					tmp_node_set_size, &feature_bitmap, 
					job_ptr, part_ptr, min_nodes, 
					max_nodes, req_nodes);
#if 0
{
			char *tmp_str = bitmap2node_name(feature_bitmap);
			info("job %u needs %u nodes with feature %s, using %s", 
				job_ptr->job_id, feat_ptr->count, 
				feat_ptr->name, tmp_str);
			xfree(tmp_str);
}
#endif
			/* purge the private copies made above */
			for (i=0; i<tmp_node_set_size; i++) {
				xfree(tmp_node_set_ptr[i].features);
				FREE_NULL_BITMAP(tmp_node_set_ptr[i].feature_bits);
				FREE_NULL_BITMAP(tmp_node_set_ptr[i].my_bitmap);
			}
			if (error_code != SLURM_SUCCESS) {
				/* defensive: never carry a partial result
				 * out of the loop */
				FREE_NULL_BITMAP(feature_bitmap);
				break;
			}
			if (feature_bitmap) {
				if (accumulate_bitmap) {
					bit_or(accumulate_bitmap, feature_bitmap);
					bit_free(feature_bitmap);
				} else
					accumulate_bitmap = feature_bitmap;
			}
		}
		list_iterator_destroy(feat_iter);
	}

	/* restore most of job state and accumulate remaining resources */
	min_nodes = saved_min_nodes;
	req_nodes = saved_req_nodes;
	job_ptr->details->min_nodes = saved_job_min_nodes;
	job_ptr->num_procs = saved_num_procs;
	if (saved_req_node_bitmap) {
		/* work on a scratch copy so that the job's original 
		 * bitmap can be restored unmodified below */
		job_ptr->details->req_node_bitmap = 
				bit_copy(saved_req_node_bitmap);
	}
	if (accumulate_bitmap) {
		if (job_ptr->details->req_node_bitmap) {
			bit_or(job_ptr->details->req_node_bitmap, 
				accumulate_bitmap);
			FREE_NULL_BITMAP(accumulate_bitmap);
		} else
			job_ptr->details->req_node_bitmap = accumulate_bitmap;
	}
	xfree(tmp_node_set_ptr);
	if (error_code == SLURM_SUCCESS) {
		error_code = _pick_best_nodes(node_set_ptr, node_set_size,
				select_bitmap, job_ptr, part_ptr, min_nodes, 
				max_nodes, req_nodes);
	}

	/* restore job's initial required node bitmap: release the scratch 
	 * bitmap (which also holds any accumulated feature nodes) and 
	 * hand the original back to the job */
	FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
	job_ptr->details->req_node_bitmap = saved_req_node_bitmap;


	return error_code;
}
Moe Jette's avatar
Moe Jette committed
/*
 * _pick_best_nodes - from a weight-ordered list of all nodes satisfying a 
 *	job's specifications, select the "best" for use
 * IN node_set_ptr - pointer to node specification information
 * IN node_set_size - number of entries in records pointed to by node_set_ptr
 * OUT select_bitmap - returns bitmap of selected nodes, must FREE_NULL_BITMAP
 * IN job_ptr - pointer to job being scheduled
 * IN part_ptr - pointer to the partition in which the job is being scheduled
 * IN min_nodes - minimum count of nodes required by the job
 * IN max_nodes - maximum count of nodes required by the job (0==no limit)
 * IN req_nodes - requested (or desired) count of nodes
 * RET SLURM_SUCCESS on success, 
 *	ESLURM_NODES_BUSY if request can not be satisfied now, 
 *	ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE if request can never 
 *	be satisfied , or
 *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE if the job can not be 
 *	initiated until the partition's configuration changes
 * NOTE: the caller must FREE_NULL_BITMAP memory pointed to by select_bitmap
 * Notes: The algorithm is
 *	1) If required node list is specified, determine implicitly required
 *	   processor and node count 
 *	2) Determine how many disjoint required "features" are represented 
 *	3) For each feature: find matching node table entries, identify nodes 
 *	   that are up and available (idle or shared) and add them to a bitmap
 *	4) If nodes _not_ shared then call select_g_job_test() to select the 
 *	   "best" of those based upon topology, else call _pick_best_load()
 *	   to pick the "best" nodes in terms of workload
 *	5) If request can't be satisfied now, execute select_g_job_test() 
 *	   against the list of nodes that exist in any state (perhaps DOWN 
 *	   DRAINED or ALLOCATED) to determine if the request can
 *         ever be satisfied.
Moe Jette's avatar
Moe Jette committed
 */
static int
_pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
		 bitstr_t ** select_bitmap, struct job_record *job_ptr,
		 struct part_record *part_ptr,
		 uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes)
	int error_code = SLURM_SUCCESS, i, j, pick_code;
Moe Jette's avatar
Moe Jette committed
	int total_nodes = 0, total_cpus = 0; 
	int avail_nodes = 0, avail_cpus = 0;	
	int avail_mem = 0; /* avail_: resources available for use now */
	bitstr_t *avail_bitmap = NULL, *total_bitmap = NULL;
	bitstr_t *partially_idle_node_bitmap = NULL, *possible_bitmap = NULL;
	int max_feature, min_feature;
	bool runable_ever  = false;	/* Job can ever run */
	bool runable_avail = false;	/* Job can run with available nodes */
	select_type_plugin_info_t cr_type = SELECT_TYPE_INFO_NONE; 

		info("_pick_best_nodes: empty node set for selection");
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
        /* Is Consumable Resources enabled? */
        error_code = select_g_get_info_from_plugin (SELECT_CR_PLUGIN, 
						    &cr_enabled);
        if (error_code != SLURM_SUCCESS)
                return error_code;

	shared = _resolve_shared_status(job_ptr->details->shared,
					part_ptr->max_share, cr_enabled);
Moe Jette's avatar
Moe Jette committed
	if (cr_enabled) {
		job_ptr->cr_enabled = cr_enabled; /* CR enabled for this job */

		cr_type = (select_type_plugin_info_t) slurmctld_conf.
							select_type_param;
		if ((cr_type == CR_CORE) ||
		    (cr_type == CR_CPU)  || (cr_type == CR_SOCKET)) {
			job_ptr->details->job_max_memory = 0;
		}
                debug3("Job %u shared %d cr_enabled %d CR type %d num_procs %d", 
		     job_ptr->job_id, shared, cr_enabled, cr_type, 
		     job_ptr->num_procs);
			partially_idle_node_bitmap = bit_copy(idle_node_bitmap);
		} else {
			/* Update partially_idle_node_bitmap to reflect the
			 * idle and partially idle nodes */
			error_code = select_g_get_info_from_plugin (
					SELECT_BITMAP, 
					&partially_idle_node_bitmap);
		}

                if (error_code != SLURM_SUCCESS) {
                       FREE_NULL_BITMAP(partially_idle_node_bitmap);
                       return error_code;
                }
        }

	if (job_ptr->details->req_node_bitmap) {  /* specific nodes required */
		/* we have already confirmed that all of these nodes have a
		 * usable configuration and are in the proper partition */
			total_nodes = bit_set_count(
				job_ptr->details->req_node_bitmap);
		if (total_nodes > max_nodes) {
			/* exceeds node limit */
                        if (cr_enabled) 
                                FREE_NULL_BITMAP(partially_idle_node_bitmap);
			return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
Moe Jette's avatar
Moe Jette committed

		/* check the availability of these nodes */
		/* Should we check memory availability on these nodes? */
		if (!bit_super_set(job_ptr->details->req_node_bitmap, 
				   avail_node_bitmap)) {
			if (cr_enabled)
				FREE_NULL_BITMAP(partially_idle_node_bitmap);
			return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
		}

		if (cr_enabled) {
			if (!bit_super_set(job_ptr->details->req_node_bitmap, 
					   partially_idle_node_bitmap)) {
				FREE_NULL_BITMAP(partially_idle_node_bitmap);
				return ESLURM_NODES_BUSY;
			}
		}
Moe Jette's avatar
Moe Jette committed
		if (shared) {
			if (!bit_super_set(job_ptr->details->req_node_bitmap, 
Moe Jette's avatar
Moe Jette committed
					   share_node_bitmap)) {
				if (cr_enabled)
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
Moe Jette's avatar
Moe Jette committed
				return ESLURM_NODES_BUSY;
Moe Jette's avatar
Moe Jette committed
		} else {
			if (!bit_super_set(job_ptr->details->req_node_bitmap, 
					   idle_node_bitmap)) {
				if (cr_enabled)
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
Moe Jette's avatar
Moe Jette committed
				return ESLURM_NODES_BUSY;
Moe Jette's avatar
Moe Jette committed
		/* still must go through select_g_job_test() to 
		 * determine validity of request and/or perform
		 * set-up before job launch */
		total_nodes = 0;	/* reinitialize */
	/* identify the min and max feature values for exclusive OR */
	max_feature = -1;
	min_feature = MAX_FEATURES;
	for (i = 0; i < node_set_size; i++) {
		j = bit_ffs(node_set_ptr[i].feature_bits);
		if ((j >= 0) && (j < min_feature))
			min_feature = j;
		j = bit_fls(node_set_ptr[i].feature_bits);
		if ((j >= 0) && (j > max_feature))
			max_feature = j;
	for (j = min_feature; j <= max_feature; j++) {
		for (i = 0; i < node_set_size; i++) {
			if (!bit_test(node_set_ptr[i].feature_bits, j))
				error_code = _add_node_set_info(
					&node_set_ptr[i],
					&total_bitmap, 
					&total_nodes, 
					&total_cpus,
				if (error_code != SLURM_SUCCESS) {
					if (cr_enabled) {
						FREE_NULL_BITMAP(
							partially_idle_node_bitmap);
					}
Moe Jette's avatar
Moe Jette committed
					FREE_NULL_BITMAP(avail_bitmap);
					FREE_NULL_BITMAP(possible_bitmap);
			bit_and(node_set_ptr[i].my_bitmap, avail_node_bitmap);
			if (cr_enabled) {
				bit_and(node_set_ptr[i].my_bitmap,
					partially_idle_node_bitmap);
			}
			if (shared) {
				bit_and(node_set_ptr[i].my_bitmap,
					share_node_bitmap);
				bit_and(node_set_ptr[i].my_bitmap,
					idle_node_bitmap);
				bit_set_count(node_set_ptr[i].my_bitmap);
			avail_mem = job_ptr->details->job_max_memory;
			error_code = _add_node_set_info(&node_set_ptr[i], 
							&avail_bitmap, 
                                                        &avail_nodes, 
							&avail_cpus, 
                        if (error_code != SLURM_SUCCESS) {
				if (cr_enabled) { 
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				FREE_NULL_BITMAP(total_bitmap);
				FREE_NULL_BITMAP(avail_bitmap);
				FREE_NULL_BITMAP(possible_bitmap);
				continue; /* Keep accumulating */
			if (avail_nodes == 0)
				continue; /* Keep accumulating */
			if ((job_ptr->details->req_node_bitmap) &&
			    (!bit_super_set(job_ptr->details->req_node_bitmap, 
					avail_bitmap)))
			if ((avail_nodes  < min_nodes) ||
			    ((req_nodes   > min_nodes) && 
			     (avail_nodes < req_nodes)))
				continue;	/* Keep accumulating nodes */
			if (avail_cpus   < job_ptr->num_procs)
				continue;	/* Keep accumulating CPUs */

			/* NOTE: select_g_job_test() is destructive of
			 * avail_bitmap, so save a backup copy */
			backup_bitmap = bit_copy(avail_bitmap);
			pick_code = select_g_job_test(job_ptr, 
						      avail_bitmap, 
						      min_nodes, 
						      max_nodes,
						      req_nodes,
						      false);
			if (pick_code == SLURM_SUCCESS) {
				if (bit_set_count(avail_bitmap) > max_nodes) {
					/* end of tests for this feature */
					avail_nodes = 0; 
					break;
				}
				FREE_NULL_BITMAP(total_bitmap);
				FREE_NULL_BITMAP(possible_bitmap);
				if (cr_enabled) {
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				}
				*select_bitmap = avail_bitmap;
				return SLURM_SUCCESS;
			} else {
				FREE_NULL_BITMAP(avail_bitmap);
		} /* for (i = 0; i < node_set_size; i++) */
#ifndef HAVE_BG
		pick_code = 1;
		if (job_ptr->details->req_node_bitmap &&
		    !bit_super_set(job_ptr->details->req_node_bitmap,
		    		  avail_bitmap))
			pick_code = 0;
		if ((avail_nodes < min_nodes) ||
		    ((req_nodes  > min_nodes) && (avail_nodes < req_nodes)))
			pick_code = 0;
		if (avail_cpus   < job_ptr->num_procs)
			pick_code = 0;
			
		if (pick_code && cr_enabled) {
			/* now that we have all possible resources,
			 * let's call the select plugin */
			backup_bitmap = bit_copy(avail_bitmap);
			pick_code = select_g_job_test(job_ptr, 
						      avail_bitmap, 
						      min_nodes, 
						      max_nodes,
						      req_nodes,
						      false);
			if (pick_code == SLURM_SUCCESS) {
				FREE_NULL_BITMAP(backup_bitmap);
				FREE_NULL_BITMAP(total_bitmap);
				FREE_NULL_BITMAP(possible_bitmap);
				FREE_NULL_BITMAP(partially_idle_node_bitmap);
				*select_bitmap = avail_bitmap;
				return SLURM_SUCCESS;
			} else {
				FREE_NULL_BITMAP(avail_bitmap);
				avail_bitmap = backup_bitmap;
			}
		} else if (pick_code && shared) {
			/* try picking the lightest load from all
			   available nodes with this feature set */
			backup_bitmap = bit_copy(avail_bitmap);
			pick_code = _pick_best_load(job_ptr, 
						    avail_bitmap, 
						    min_nodes, 
						    max_nodes,
						    req_nodes,
						    false);
			if (pick_code == SLURM_SUCCESS) {
				FREE_NULL_BITMAP(backup_bitmap);
				if (bit_set_count(avail_bitmap) > max_nodes) {
					avail_nodes = 0; 
				} else {
					FREE_NULL_BITMAP(total_bitmap);
					FREE_NULL_BITMAP(possible_bitmap);
					if (cr_enabled) {
						FREE_NULL_BITMAP(
						    partially_idle_node_bitmap);
					}
					*select_bitmap = avail_bitmap;
					return SLURM_SUCCESS;
				}
			} else {
				FREE_NULL_BITMAP(avail_bitmap);
				avail_bitmap = backup_bitmap;
			}
		}
		/* try to get req_nodes now for this feature */
		if (avail_bitmap
		&&  (req_nodes   >  min_nodes) 
		&&  (avail_nodes >= min_nodes)
		&&  (avail_nodes <  req_nodes)
		&&  ((job_ptr->details->req_node_bitmap == NULL) ||
		     bit_super_set(job_ptr->details->req_node_bitmap, 
                                        avail_bitmap))) {
			pick_code = select_g_job_test(job_ptr, avail_bitmap, 
						      min_nodes, max_nodes,
			if ((pick_code == SLURM_SUCCESS) &&
			     (bit_set_count(avail_bitmap) <= max_nodes)) {
				FREE_NULL_BITMAP(total_bitmap);
				FREE_NULL_BITMAP(possible_bitmap);
				if (cr_enabled) { 
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				*select_bitmap = avail_bitmap;
				return SLURM_SUCCESS;
			}
		}

		/* determine if job could possibly run (if all configured 
		 * nodes available) */
		if (total_bitmap
		&&  (!runable_ever || !runable_avail)
		&&  (total_nodes >= min_nodes)
		&&  ((slurmctld_conf.fast_schedule == 0) ||
		     (total_cpus >= job_ptr->num_procs))
		&&  ((job_ptr->details->req_node_bitmap == NULL) ||
		     (bit_super_set(job_ptr->details->req_node_bitmap, 
					total_bitmap)))) {
			if (!runable_avail) {
				FREE_NULL_BITMAP(avail_bitmap);
				avail_bitmap = bit_copy(total_bitmap);
				if (avail_bitmap == NULL)
					fatal("bit_copy malloc failure");
				bit_and(avail_bitmap, avail_node_bitmap);
				pick_code = select_g_job_test(job_ptr, 
							      avail_bitmap, 
							      min_nodes, 
							      max_nodes,
							      true);
                                if (cr_enabled)
                                        job_ptr->cr_enabled = 1;
				if (pick_code == SLURM_SUCCESS) {
					runable_ever  = true;
					if (bit_set_count(avail_bitmap) <=
					     max_nodes)
					FREE_NULL_BITMAP(possible_bitmap);
					possible_bitmap = avail_bitmap;
					avail_bitmap = NULL;
				pick_code = select_g_job_test(job_ptr, 
							      total_bitmap, 
							      min_nodes, 
							      max_nodes,
							      true);
                                if (cr_enabled)
                                        job_ptr->cr_enabled = 1;
				if (pick_code == SLURM_SUCCESS) {
					FREE_NULL_BITMAP(possible_bitmap);
					possible_bitmap = total_bitmap;
					total_bitmap = NULL;
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(total_bitmap);
		if (error_code != SLURM_SUCCESS)
        if (cr_enabled) 
                FREE_NULL_BITMAP(partially_idle_node_bitmap);

	/* The job is not able to start right now, return a 
	 * value indicating when the job can start */
	if (!runable_avail)
		error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
	if (!runable_ever) {
		error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		info("_pick_best_nodes %u : job never runnable", job_ptr->job_id);
	if (error_code == SLURM_SUCCESS) {
		*select_bitmap = possible_bitmap; 
	} else {
		FREE_NULL_BITMAP(possible_bitmap);