Skip to content
Snippets Groups Projects
node_scheduler.c 34.9 KiB
Newer Older
 * node_scheduler.c - select and allocated nodes to jobs 
 * see slurm.h for documentation on external functions and data structures
 *
 * NOTE: DEBUG_MODULE mode test with execution line
 *	node_scheduler ../../etc/slurm.conf2 ../../etc/slurm.jobs
 * author: moe jette, jette@llnl.gov
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <syslog.h>

#include "slurm.h"

#define BUF_SIZE 1024
#define NO_VAL (-99)

struct node_set {		/* set of nodes with same configuration that could be allocated */
	int cpus_per_node;
	int nodes;
	int weight;
	int feature;
Moe Jette's avatar
Moe Jette committed
};

int count_cpus (unsigned *bitmap);
int is_key_valid (int key);
int match_group (char *allow_groups, char *user_groups);
int match_feature (char *seek, char *available);
int parse_job_specs (char *job_specs, char **req_features,
		     char **req_node_list, char **job_name, char **req_group,
		     char **req_partition, int *contiguous, int *req_cpus,
		     int *req_nodes, int *min_cpus, int *min_memory,
		     int *min_tmp_disk, int *key, int *shared);
int pick_best_quadrics (bitstr_t *bitmap, bitstr_t *req_bitmap, int req_nodes,
		    int req_cpus, int consecutive);
int pick_best_nodes (struct node_set *node_set_ptr, int node_set_size,
		     bitstr_t **req_bitmap, int req_cpus, int req_nodes,
		     int contiguous, int shared, int max_nodes);
int valid_features (char *requested, char *available);
#if DEBUG_MODULE
/* main is used here for testing purposes only */
main (int argc, char *argv[]) {
	int error_code, line_num, i;
	FILE *command_file;
	char in_line[BUF_SIZE], *node_list;
	log_options_t opts = LOG_OPTS_STDERR_ONLY;

	log_init(argv[0], opts, SYSLOG_FACILITY_DAEMON, NULL);

	if (argc < 3) {
		printf ("usage: %s <SLURM_CONF_file> <slurm_job_file>\n",
			argv[0]);
		exit (1);
	}			

	error_code = init_slurm_conf ();
	if (error_code) {
		printf ("controller: error %d from init_slurm_conf\n",
			error_code);
		exit (error_code);
	}			

	error_code = read_slurm_conf (argv[1]);
	if (error_code) {
		printf ("controller: error %d from read_slurm_conf\n",
			error_code);
		exit (error_code);
	}			

	/* mark everything up and idle for testing */
	bit_nset (idle_node_bitmap, 0, node_record_count-1);
	bit_nset (up_node_bitmap,   0, node_record_count-1);

	command_file = fopen (argv[2], "r");
	if (command_file == NULL) {
		fprintf (stderr,
			 "node_scheduler: error %d opening command file %s\n",
			 errno, argv[2]);
		exit (1);
	}			

	i = valid_features ("fs1&fs2", "fs1");
	if (i != 0)
		printf ("valid_features error 1\n");
	i = valid_features ("fs1|fs2", "fs1");
	if (i != 1)
		printf ("valid_features error 2\n");
	i = valid_features ("fs1|fs2&fs3", "fs1,fs3");
	if (i != 1)
		printf ("valid_features error 3\n");
	i = valid_features ("[fs1|fs2]&fs3", "fs2,fs3");
	if (i != 2)
		printf ("valid_features error 4\n");
	i = valid_features ("fs0&[fs1|fs2]&fs3", "fs2,fs3");
	if (i != 0)
		printf ("valid_features error 5\n");
	i = valid_features ("fs3&[fs1|fs2]&fs3", "fs2,fs3");
	if (i != 2)
		printf ("valid_features error 6\n");

	line_num = 0;
	printf ("\n");
	while (fgets (in_line, BUF_SIZE, command_file)) {
		if (in_line[strlen (in_line) - 1] == '\n')
			in_line[strlen (in_line) - 1] = (char) NULL;
		line_num++;
		error_code = select_nodes (in_line, &node_list);
		if (error_code) {
			if (strncmp (in_line, "JobName=FAIL", 12) != 0)
				printf ("ERROR:");
			printf ("for job: %s\n", in_line, node_list);
			printf ("node_scheduler: error %d from select_nodes on line %d\n\n", 
				error_code, line_num);
		}
		else {
			if (strncmp (in_line, "job_name=fail", 12) == 0)
				printf ("ERROR: ");
			printf ("for job: %s\n  nodes selected %s\n\n",
				in_line, node_list);
/* for a given bitmap, change the state of specified nodes to stage_in */
/* this is a simple prototype for testing 
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
 */
void 
allocate_nodes (unsigned *bitmap) {
	for (i = 0; i < node_record_count; i++) {
		if (bit_test (bitmap, i) == 0)
			continue;
		node_record_table_ptr[i].node_state = STATE_STAGE_IN;
		bit_clear (idle_node_bitmap, i);
 * count_cpus - report how many cpus are associated with the identified nodes 
 * input: bitmap - a node bitmap
 * output: returns a cpu count
 * globals: node_record_count - number of nodes configured
 *	node_record_table_ptr - pointer to global node table
Moe Jette's avatar
Moe Jette committed
 */
int 
count_cpus (unsigned *bitmap) {
Moe Jette's avatar
Moe Jette committed

 	for (i = 0; i < node_record_count; i++) {
		if (bit_test (bitmap, i) != 1)
			continue;
		sum += node_record_table_ptr[i].cpus;
 * is_key_valid - determine if supplied key is valid
 * input: key - a slurm key acquired by user root
 * output: returns 1 if key is valid, 0 otherwise
 * NOTE: this is only a placeholder for a future function
Moe Jette's avatar
Moe Jette committed
 */
 * match_feature - determine if the desired feature (seek) is one of those available
 * input: seek - desired feature
 *        available - comma separated list of features
 * output: returns 1 if found, 0 otherwise
int 
match_feature (char *seek, char *available) {
	char *tmp_available, *str_ptr3, *str_ptr4;
	int found;
	if (seek == NULL)
		return 1;	/* nothing to look for */
	if (available == NULL)
		return 0;	/* nothing to find */
	tmp_available = xmalloc (strlen (available) + 1);
	strcpy (tmp_available, available);
	found = 0;
	str_ptr3 = (char *) strtok_r (tmp_available, ",", &str_ptr4);
	while (str_ptr3) {
		if (strcmp (seek, str_ptr3) == 0) {	/* we have a match */
			found = 1;
			break;
		}		
		str_ptr3 = (char *) strtok_r (NULL, ",", &str_ptr4);
Moe Jette's avatar
Moe Jette committed
/*
 * match_group - determine if the user is a member of any groups permitted to use this partition
 * input: allow_groups - comma delimited list of groups permitted to use the partition, 
 *			NULL is for all groups
 *        user_groups - comma delimited list of groups the user belongs to
 * output: returns 1 if user is member, 0 otherwise
Moe Jette's avatar
Moe Jette committed
 */
int
match_group (char *allow_groups, char *user_groups) {
	char *tmp_allow_group, *str_ptr1, *str_ptr2;
	char *tmp_user_group, *str_ptr3, *str_ptr4;
Moe Jette's avatar
Moe Jette committed

	if ((allow_groups == NULL) ||	/* anybody can use it */
	    (strcmp (allow_groups, "all") == 0))
		return 1;
	if (user_groups == NULL)
		return 0;	/* empty group list */
Moe Jette's avatar
Moe Jette committed

	tmp_allow_group = xmalloc (strlen (allow_groups) + 1);
	strcpy (tmp_allow_group, allow_groups);
Moe Jette's avatar
Moe Jette committed

	tmp_user_group = xmalloc (strlen (user_groups) + 1);

	str_ptr1 = (char *) strtok_r (tmp_allow_group, ",", &str_ptr2);
	while (str_ptr1) {
		strcpy (tmp_user_group, user_groups);
		str_ptr3 = (char *) strtok_r (tmp_user_group, ",", &str_ptr4);
		while (str_ptr3) {
			if (strcmp (str_ptr1, str_ptr3) == 0) {	/* we have a match */
				xfree (tmp_allow_group);
				xfree (tmp_user_group);
				return 1;
			}	
			str_ptr3 = (char *) strtok_r (NULL, ",", &str_ptr4);
		}	
		str_ptr1 = (char *) strtok_r (NULL, ",", &str_ptr2);
	}		
	xfree (tmp_allow_group);
	xfree (tmp_user_group);
 * parse_job_specs - pick the appropriate fields out of a job request specification
 * input: job_specs - string containing the specification
 *        req_features, etc. - pointers to storage for the specifications
 * output: req_features, etc. - the job's specifications
 *         returns 0 if no error, errno otherwise
 * NOTE: the calling function must xfree memory at req_features[0], req_node_list[0],
 *	job_name[0], req_group[0], and req_partition[0]
Moe Jette's avatar
Moe Jette committed
 */
int 
parse_job_specs (char *job_specs, char **req_features, char **req_node_list,
		 char **job_name, char **req_group, char **req_partition,
		 int *contiguous, int *req_cpus, int *req_nodes,
		 int *min_cpus, int *min_memory, int *min_tmp_disk, int *key,
		 int *shared) {
	int bad_index, error_code, i;
	char *temp_specs;

	req_features[0] = req_node_list[0] = req_group[0] = req_partition[0] =
		job_name[0] = NULL;
	*contiguous = *req_cpus = *req_nodes = *min_cpus = *min_memory =
		*min_tmp_disk = NO_VAL;
	*key = *shared = NO_VAL;

	temp_specs = xmalloc (strlen (job_specs) + 1);
	strcpy (temp_specs, job_specs);

	error_code = load_string (job_name, "JobName=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_string (req_features, "Features=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_string (req_node_list, "NodeList=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_string (req_group, "Groups=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_string (req_partition, "Partition=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_integer (contiguous, "Contiguous", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_integer (req_cpus, "TotalCPUs=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_integer (req_nodes, "TotalNodes=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_integer (min_cpus, "MinCPUs=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_integer (min_memory, "MinMemory=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_integer (min_tmp_disk, "MinTmpDisk=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_integer (key, "Key=", temp_specs);
	if (error_code)
		goto cleanup;

	error_code = load_integer (shared, "Shared=", temp_specs);
	if (error_code)
		goto cleanup;

	bad_index = -1;
	for (i = 0; i < strlen (temp_specs); i++) {
		if (isspace ((int) temp_specs[i]) || (temp_specs[i] == '\n'))
			continue;
		bad_index = i;
		break;
	}			
Moe Jette's avatar
Moe Jette committed

		error ("parse_job_specs: bad job specification input: %s",
			 &temp_specs[bad_index]);
		error_code = EINVAL;
	}			

	req_features[0] = req_node_list[0] = req_group[0] = req_partition[0] =
		job_name[0] = NULL;
}
 * pick_best_quadrics - identify the nodes which best fit the req_nodes and req_cpus counts
 *	for a system with Quadrics elan interconnect.
 * 	"best" is defined as either single set of consecutive nodes satisfying 
 *	the request and leaving the minimum number of unused nodes OR 
 *	the fewest number of consecutive node sets
 * input: bitmap - the bit map to search
 *        req_bitmap - the bit map of nodes that must be selected, if not NULL these 
 *                     have already been confirmed to be in the input bitmap
 *        req_nodes - number of nodes required
 *        req_cpus - number of cpus required
 *        consecutive - nodes must be consecutive is 1, otherwise 0
 * output: bitmap - nodes not required to satisfy the request are cleared, other left set
 *         returns zero on success, EINVAL otherwise
 * globals: node_record_count - count of nodes configured
 *	node_record_table_ptr - pointer to global node table
 * NOTE: bitmap must be a superset of req_nodes at function call time
pick_best_quadrics (bitstr_t *bitmap, bitstr_t *req_bitmap, int req_nodes,
		int req_cpus, int consecutive) {
	int i, index, error_code, sufficient;
	int *consec_nodes;	/* how many nodes we can add from this consecutive set of nodes */
	int *consec_cpus;	/* how many nodes we can add from this consecutive set of nodes */
	int *consec_start;	/* where this consecutive set starts (index) */
	int *consec_end;	/* where this consecutive set ends (index) */
	int *consec_req;	/* are nodes from this set required (in req_bitmap) */
	int consec_index, consec_size;
	int rem_cpus, rem_nodes;	/* remaining resources required */
	int best_fit_nodes, best_fit_cpus, best_fit_req, best_fit_location,
		best_fit_sufficient;

	if (bitmap == NULL)
		fatal ("pick_best_quadrics: bitmap pointer is NULL\n");

	error_code = EINVAL;	/* default is no fit */
	consec_index = 0;
	consec_size = 50;	/* start allocation for 50 sets of consecutive nodes */
	consec_cpus  = xmalloc (sizeof (int) * consec_size);
	consec_nodes = xmalloc (sizeof (int) * consec_size);
	consec_start = xmalloc (sizeof (int) * consec_size);
	consec_end   = xmalloc (sizeof (int) * consec_size);
	consec_req   = xmalloc (sizeof (int) * consec_size);

	consec_cpus[consec_index] = consec_nodes[consec_index] = 0;
	consec_req[consec_index] = -1;	/* no required nodes here by default */
	rem_cpus = req_cpus;
	rem_nodes = req_nodes;
	for (index = 0; index < node_record_count; index++) {
		if (bit_test (bitmap, index)) {
			if (consec_nodes[consec_index] == 0)
				consec_start[consec_index] = index;
			i = node_record_table_ptr[index].cpus;
			if (req_bitmap && bit_test (req_bitmap, index)) {
				if (consec_req[consec_index] == -1) /* first required node in set */
					consec_req[consec_index] = index; 
				rem_cpus -= i;	/* reduce count of additional resources required */
				rem_nodes--;	/* reduce count of additional resources required */
			}
			else {
				consec_cpus[consec_index] += i;
				consec_nodes[consec_index]++;
			}	
		}
		else if (consec_nodes[consec_index] == 0) {
			consec_req[consec_index] = -1;	/* already picked up any required nodes */
			/* re-use this record */
		}
		else {
			consec_end[consec_index] = index - 1;
			if (++consec_index >= consec_size) {
				consec_size *= 2;
				xrealloc (consec_cpus,  sizeof (int) * consec_size);
				xrealloc (consec_nodes, sizeof (int) * consec_size);
				xrealloc (consec_start, sizeof (int) * consec_size);
				xrealloc (consec_end,   sizeof (int) * consec_size);
				xrealloc (consec_req,   sizeof (int) * consec_size);
			}	
			consec_cpus[consec_index] = 0;
			consec_nodes[consec_index] = 0;
			consec_req[consec_index] = -1;
		}		
	}
	if (consec_nodes[consec_index] != 0)
		consec_end[consec_index++] = index - 1;
	info ("rem_cpus=%d, rem_nodes=%d", rem_cpus, rem_nodes);
	for (i = 0; i < consec_index; i++) {
		if (consec_req[i] != -1)
			info ("start=%s, end=%s, nodes=%d, cpus=%d, req=%s",
				node_record_table_ptr[consec_start[i]].name,
				node_record_table_ptr[consec_end[i]].name,
				consec_nodes[i], consec_cpus[i],
				node_record_table_ptr[consec_req[i]].name);
		else
			info ("start=%s, end=%s, nodes=%d, cpus=%d",
				node_record_table_ptr[consec_start[i]].name,
				node_record_table_ptr[consec_end[i]].name,
				consec_nodes[i], consec_cpus[i]);
	while (consec_index) {
		best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
		best_fit_req = -1;	/* first required node, -1 if none */
		for (i = 0; i < consec_index; i++) {
			if (consec_nodes[i] == 0)
				continue;
			sufficient = ((consec_nodes[i] >= rem_nodes)
				      && (consec_cpus[i] >= rem_cpus));
			if ((best_fit_nodes == 0) ||	/* first possibility */
			    ((best_fit_req == -1) && (consec_req[i] != -1)) ||	/* required nodes */
			    (sufficient && (best_fit_sufficient == 0)) ||	/* first large enough */
			    (sufficient && (consec_cpus[i] < best_fit_cpus)) ||	/* less waste option */
			    ((sufficient == 0) && (consec_cpus[i] > best_fit_cpus))) {	/* larger option */
				best_fit_cpus = consec_cpus[i];
				best_fit_nodes = consec_nodes[i];
				best_fit_location = i;
				best_fit_req = consec_req[i];
				best_fit_sufficient = sufficient;
			}	
		if (consecutive && ((best_fit_nodes < rem_nodes) || (best_fit_cpus < rem_cpus)))
			break;	/* no hole large enough */
		if (best_fit_req != -1) {	/* work out from required nodes */
			for (i = best_fit_req;
			     i <= consec_end[best_fit_location]; i++) {
				if ((rem_nodes <= 0) && (rem_cpus <= 0))
					break;
				rem_nodes--;
				rem_cpus -= node_record_table_ptr[i].cpus;
			for (i = (best_fit_req - 1);
			     i >= consec_start[best_fit_location]; i--) {
				if ((rem_nodes <= 0) && (rem_cpus <= 0))
					break;
				/* if (bit_test(bitmap, i)) continue;  nothing set earlier */
				bit_set (bitmap, i);
				rem_nodes--;
				rem_cpus -= node_record_table_ptr[i].cpus;
		}
		else {
			for (i = consec_start[best_fit_location];
			     i <= consec_end[best_fit_location]; i++) {
				if ((rem_nodes <= 0) && (rem_cpus <= 0))
					break;
				rem_nodes--;
				rem_cpus -= node_record_table_ptr[i].cpus;
		}		
		if ((rem_nodes <= 0) && (rem_cpus <= 0)) {
			error_code = 0;
			break;
		}		
		consec_cpus[best_fit_location] = 0;
		consec_nodes[best_fit_location] = 0;
	}			

      cleanup:
	if (consec_cpus)
Moe Jette's avatar
Moe Jette committed
/*
 * pick_best_nodes - from nodes satisfying partition and configuration specifications, 
Moe Jette's avatar
Moe Jette committed
 *	select the "best" for use
 * input: node_set_ptr - pointer to node specification information
 *        node_set_size - number of entries in records pointed to by node_set_ptr
 *        req_bitmap - pointer to bitmap of specific nodes required by the job, could be NULL
 *        req_cpus - count of cpus required by the job
 *        req_nodes - count of nodes required by the job
 *        contiguous - set to 1 if allocated nodes must be contiguous, 0 otherwise
 *        shared - set to 1 if nodes may be shared, 0 otherwise
 *        max_nodes - maximum number of nodes permitted for job, -1 for none (partition limit)
 * output: req_bitmap - pointer to bitmap of selected nodes
 *         returns 0 on success, EAGAIN if request can not be satisfied now, 
Moe Jette's avatar
Moe Jette committed
 *		EINVAL if request can never be satisfied (insufficient contiguous nodes)
 * NOTE: the caller must xfree memory pointed to by req_bitmap
Moe Jette's avatar
Moe Jette committed
 */
int
pick_best_nodes (struct node_set *node_set_ptr, int node_set_size,
		 bitstr_t **req_bitmap, int req_cpus, int req_nodes,
		 int contiguous, int shared, int max_nodes) {
	int error_code, i, j, size;
	int total_nodes, total_cpus;	/* total resources configured in partition */
	int avail_nodes, avail_cpus;	/* resources available for use now */
	bitstr_t *avail_bitmap, *total_bitmap;
	int max_feature, min_feature;
	int *cpus_per_node;
	int avail_set, total_set, runable;

	if (node_set_size == 0)
		return EINVAL;
	if ((max_nodes != -1) && (req_nodes > max_nodes))
		return EINVAL;
	error_code = 0;
	avail_bitmap = total_bitmap = NULL;
	avail_nodes = avail_cpus = 0;
	total_nodes = total_cpus = 0;
	if (req_bitmap[0]) {	/* specific nodes required */
		/* NOTE: we have already confirmed that all of these nodes have a usable */
		/*       configuration and are in the proper partition */
		if (req_nodes != 0)
			total_nodes = bit_set_count (req_bitmap[0]);
		if (req_cpus != 0)
			total_cpus = count_cpus (req_bitmap[0]);
		if (total_nodes > max_nodes)
			return EINVAL;
		if ((req_nodes <= total_nodes) && (req_cpus <= total_cpus)) {
			if (bit_super_set (req_bitmap[0], up_node_bitmap) != 1)
			if ((shared != 1) &&
			    (bit_super_set (req_bitmap[0], idle_node_bitmap) != 1))
				return EAGAIN;
			return 0;	/* user can have selected nodes, we're done! */
		}		
		total_nodes = total_cpus = 0;	/* reinitialize */
	}			

	/* identify how many feature sets we have (e.g. "[fs1|fs2|fs3|fs4]" */
	max_feature = min_feature = node_set_ptr[0].feature;
	for (i = 1; i < node_set_size; i++) {
		if (node_set_ptr[i].feature > max_feature)
			max_feature = node_set_ptr[i].feature;
		if (node_set_ptr[i].feature < min_feature)
			min_feature = node_set_ptr[i].feature;

	runable = 0;		/* assume not runable until otherwise demonstrated */
	for (j = min_feature; j <= max_feature; j++) {
		avail_set = total_set = 0;
		for (i = 0; i < node_set_size; i++) {
			if (node_set_ptr[i].feature != j)
				continue;
			if (runable == 0) {
				if (total_set)
						   node_set_ptr[i].my_bitmap);
				else {
					total_bitmap = bit_copy (node_set_ptr[i].my_bitmap);
					if (total_bitmap == NULL) 
						fatal ("bit_copy failed to allocate memory");
					total_set = 1;
				}	
				total_nodes += node_set_ptr[i].nodes;
				total_cpus +=
					(node_set_ptr[i].nodes * node_set_ptr[i].cpus_per_node);
			bit_and (node_set_ptr[i].my_bitmap,
				bit_and (node_set_ptr[i].my_bitmap,
					    idle_node_bitmap);
			node_set_ptr[i].nodes =
				bit_set_count (node_set_ptr[i].my_bitmap);
					   node_set_ptr[i].my_bitmap);
			else {
				avail_bitmap = bit_copy (node_set_ptr[i].my_bitmap);
				if (avail_bitmap == NULL) 
					fatal ("bit_copy memory allocation failure");
				avail_set = 1;
			}	
			avail_nodes += node_set_ptr[i].nodes;
			avail_cpus +=
				(node_set_ptr[i].nodes *
				 node_set_ptr[i].cpus_per_node);
			if ((req_bitmap[0])
			    && (bit_super_set (req_bitmap[0], avail_bitmap)
				== 0))
				continue;
			if (avail_nodes < req_nodes)
				continue;
			if (avail_cpus < req_cpus)
				continue;
			error_code =
				pick_best_quadrics (avail_bitmap, req_bitmap[0],
						req_nodes, req_cpus,
						contiguous);
			if ((error_code == 0) && (max_nodes != -1)
			    && (bit_set_count (avail_bitmap) > max_nodes)) {
				error_code = EINVAL;
				break;
			}	
			if (error_code == 0) {
				if (total_bitmap)
				req_bitmap[0] = avail_bitmap;
				return 0;
			}	
		if ((error_code == 0) && (runable == 0) &&
		    (total_nodes > req_nodes) && (total_cpus > req_cpus) &&
		    ((req_bitmap[0] == NULL)
		     || (bit_super_set (req_bitmap[0], avail_bitmap) == 1))
		    && ((max_nodes == -1) || (req_nodes <= max_nodes))) {
			/* determine if job could possibly run (if configured nodes all available) */
			error_code =
				pick_best_quadrics (total_bitmap, req_bitmap[0],
						req_nodes, req_cpus,
						contiguous);
			if ((error_code == 0) && (max_nodes != -1)
			    && (bit_set_count (avail_bitmap) > max_nodes))
				error_code = EINVAL;
			if (error_code == 0)
				runable = 1;
		}		
		if (avail_bitmap)
		avail_bitmap = total_bitmap = NULL;
		if (error_code != 0)
			break;

	if (runable == 0)
		error_code = EINVAL;
	if (error_code == 0)
		error_code = EAGAIN;
	return error_code;
}
 * select_nodes - select and allocate nodes to a job with the given specifications
 * input: job_specs - job specifications
 *        node_list - pointer to node list returned
 * output: node_list - list of allocated nodes
 *         returns 0 on success, EINVAL if not possible to satisfy request, 
 *		or EAGAIN if resources are presently busy
 * NOTE: the calling program must xfree the memory pointed to by node_list
int
select_nodes (char *job_specs, char **node_list) {
	char *req_features, *req_node_list, *job_name, *req_group,
		*req_partition, *out_line;
	int contiguous, req_cpus, req_nodes, min_cpus, min_memory,
		min_tmp_disk;
	int error_code, cpu_tally, node_tally, key, shared;
	struct part_record *part_ptr;
	bitstr_t *req_bitmap, *scratch_bitmap;
	ListIterator config_record_iterator;	/* for iterating through config_list */
	struct config_record *config_record_point;	/* pointer to config_record */
	int i;
	struct node_set *node_set_ptr;
	int node_set_index, node_set_size;

	req_features = req_node_list = job_name = req_group = req_partition =
		NULL;
	req_bitmap = scratch_bitmap = NULL;
	contiguous = req_cpus = req_nodes = min_cpus = min_memory =
		min_tmp_disk = NO_VAL;
	key = shared = NO_VAL;
	node_set_ptr = NULL;
	config_record_iterator = NULL;
	node_list[0] = NULL;
	config_record_iterator = (ListIterator) NULL;
	node_lock ();
	part_lock ();

	/* setup and basic parsing */
	error_code =
		parse_job_specs (job_specs, &req_features, &req_node_list,
				 &job_name, &req_group, &req_partition,
				 &contiguous, &req_cpus, &req_nodes,
				 &min_cpus, &min_memory, &min_tmp_disk, &key,
				 &shared);
	if (error_code != 0) {
		error_code = EINVAL;	/* permanent error, invalid parsing */
		error ("select_nodes: parsing failure on %s", job_specs);
	if ((req_cpus == NO_VAL) && (req_nodes == NO_VAL) && (req_node_list == NULL)) {
		info ("select_nodes: job failed to specify node_list, cpu or node count");
		error_code = EINVAL;
		goto cleanup;
	}			
	if (contiguous == NO_VAL)
		contiguous = 0;	/* default not contiguous */
	if (req_cpus == NO_VAL)
		req_cpus = 0;	/* default no cpu count requirements */
	if (req_nodes == NO_VAL)
		req_nodes = 0;	/* default no node count requirements */


	/* find selected partition */
	if (req_partition) {
		part_ptr =
			list_find_first (part_list, &list_find_part,
					 req_partition);
		if (part_ptr == NULL) {
			info ("select_nodes: invalid partition specified: %s",
				 req_partition);
			error_code = EINVAL;
			goto cleanup;
		}		
	}
	else {
		if (default_part_loc == NULL) {
			error ("select_nodes: default partition not set.");
			error_code = EINVAL;
			goto cleanup;
		}		
		part_ptr = default_part_loc;
	}			
	/* can this user access this partition */
	if (part_ptr->key && (is_key_valid (key) == 0)) {
		info ("select_nodes: job lacks key required of partition %s",
			 part_ptr->name);
		error_code = EINVAL;
		goto cleanup;
	}			
	if (match_group (part_ptr->allow_groups, req_group) == 0) {
		info ("select_nodes: job lacks group required of partition %s",
			 part_ptr->name);
		error_code = EINVAL;
		goto cleanup;
	}			


	/* check if select partition has sufficient resources to satisfy request */
	if (req_node_list) {	/* insure that selected nodes are in this partition */
		error_code = node_name2bitmap (req_node_list, &req_bitmap);
		if (error_code == EINVAL)
			goto cleanup;
		if (error_code != 0) {
			error_code = EAGAIN;	/* no memory */
			goto cleanup;
		}		
		if (contiguous == 1)
			bit_fill_gaps (req_bitmap);
		if (bit_super_set (req_bitmap, part_ptr->node_bitmap) != 1) {
			info ("select_nodes: requested nodes %s not in partition %s",
				req_node_list, part_ptr->name);
			error_code = EINVAL;
			goto cleanup;
		}		
		i = count_cpus (req_bitmap);
		if (i > req_cpus)
			req_cpus = i;
		i = bit_set_count (req_bitmap);
		if (i > req_nodes)
			req_nodes = i;
	}			
	if (req_cpus > part_ptr->total_cpus) {
		info ("select_nodes: too many cpus (%d) requested of partition %s(%d)",
			req_cpus, part_ptr->name, part_ptr->total_cpus);
		error_code = EINVAL;
		goto cleanup;
	}			
	if ((req_nodes > part_ptr->total_nodes)
	    || (req_nodes > part_ptr->max_nodes)) {
		if (part_ptr->total_nodes > part_ptr->max_nodes)
			i = part_ptr->max_nodes;
		else
			i = part_ptr->total_nodes;
		info ("select_nodes: too many nodes (%d) requested of partition %s(%d)",
			 req_nodes, part_ptr->name, i);
		error_code = EINVAL;
		goto cleanup;
	}			
	if (part_ptr->shared == 2)	/* shared=force */
		shared = 1;
	else if ((shared != 1) || (part_ptr->shared == 0)) /* user or partition want no sharing */
		shared = 0;


	/* pick up nodes from the weight ordered configuration list */
	node_set_index = 0;
	node_set_size = 0;
	node_set_ptr = (struct node_set *) xmalloc (sizeof (struct node_set));
	node_set_ptr[node_set_size++].my_bitmap = NULL;

	config_record_iterator = list_iterator_create (config_list);
	if (config_record_iterator == NULL) {
		fatal ("select_nodes: ListIterator_create unable to allocate memory");
		error_code = EAGAIN;
		goto cleanup;
	}			

	while (config_record_point =
	       (struct config_record *) list_next (config_record_iterator)) {
		int tmp_feature, check_node_config;

		tmp_feature =
			valid_features (req_features,
					config_record_point->feature);
		if (tmp_feature == 0)
			continue;

		/* since nodes can register with more resources than defined in the configuration,    */
		/* we want to use those higher values for scheduling, but only as needed */
		if ((min_cpus > config_record_point->cpus) ||
		    (min_memory > config_record_point->real_memory) ||
		    (min_tmp_disk > config_record_point->tmp_disk))
			check_node_config = 1;
		else
			check_node_config = 0;
		node_set_ptr[node_set_index].my_bitmap =
			bit_copy (config_record_point->node_bitmap);
		if (node_set_ptr[node_set_index].my_bitmap == NULL)
			fatal ("bit_copy memory allocation failure");
		bit_and (node_set_ptr[node_set_index].my_bitmap,
			    part_ptr->node_bitmap);
		node_set_ptr[node_set_index].nodes =
			bit_set_count (node_set_ptr[node_set_index].my_bitmap);
		/* check configuration of individual nodes only if the check of baseline */
		/* values in the configuration file are too low. this will slow the scheduling */
		/* for very large cluster. */
		if (check_node_config
		    && (node_set_ptr[node_set_index].nodes != 0)) {
			for (i = 0; i < node_record_count; i++) {
				if (bit_test
				    (node_set_ptr[node_set_index].my_bitmap, i) == 0)
					continue;
				if ((min_cpus <=
				     node_record_table_ptr[i].cpus)
				    && (min_memory <=
					node_record_table_ptr[i].real_memory)
				    && (min_tmp_disk <=
					node_record_table_ptr[i].tmp_disk))
					continue;
				bit_clear (node_set_ptr[node_set_index].my_bitmap, i);
				if ((--node_set_ptr[node_set_index].nodes) ==
				    0)
					break;
		}		
		if (node_set_ptr[node_set_index].nodes == 0) {
			bit_free (node_set_ptr[node_set_index].my_bitmap);
			node_set_ptr[node_set_index].my_bitmap = NULL;
			continue;
		}		
		if (req_bitmap) {
			if (scratch_bitmap)
					   node_set_ptr[node_set_index].
					   my_bitmap);
			else {
				scratch_bitmap =
					bit_copy (node_set_ptr[node_set_index].my_bitmap);
				if (scratch_bitmap == NULL)
					fatal ("bit_copy memory allocation failure");
			}	
		}		
		node_set_ptr[node_set_index].cpus_per_node =
			config_record_point->cpus;
		node_set_ptr[node_set_index].weight =
			config_record_point->weight;
		node_set_ptr[node_set_index].feature = tmp_feature;
Moe Jette's avatar
Moe Jette committed
#if DEBUG_MODULE > 1
		info ("found %d usable nodes from configuration with %s",
			node_set_ptr[node_set_index].nodes,
			config_record_point->nodes);
		xrealloc (node_set_ptr, sizeof (struct node_set) * (node_set_index + 1));
		node_set_ptr[node_set_size++].my_bitmap = NULL;
	}			
	if (node_set_index == 0) {
		info ("select_nodes: no node configurations satisfy requirements %d:%d:%d:%s",
			 min_cpus, min_memory, min_tmp_disk, req_features);
		error_code = EINVAL;
		goto cleanup;
	}			
	if (node_set_ptr[node_set_index].my_bitmap)
		bit_free (node_set_ptr[node_set_index].my_bitmap);
	node_set_ptr[node_set_index].my_bitmap = NULL;
	node_set_size = node_set_index;

	if (req_bitmap) {
		if ((scratch_bitmap == NULL)
		    || (bit_super_set (req_bitmap, scratch_bitmap) != 1)) {
			info ("select_nodes: requested nodes do not satisfy configurations requirements %d:%d:%d:%s",
				 min_cpus, min_memory, min_tmp_disk, req_features);
			error_code = EINVAL;
			goto cleanup;
		}		
	}			


	/* pick the nodes providing a best-fit */
	error_code = pick_best_nodes (node_set_ptr, node_set_size,
				      &req_bitmap, req_cpus, req_nodes,
				      contiguous, shared,
				      part_ptr->max_nodes);
	if (error_code == EAGAIN)
		goto cleanup;
	if (error_code == EINVAL) {
		info ("select_nodes: no nodes can satisfy job request");