/* node_scheduler.c - select and allocate nodes to jobs
* see slurm.h for documentation on external functions and data structures
*
* NOTE: DEBUG_MODULE mode test with execution line
* node_scheduler ../../etc/slurm.conf2 ../../etc/slurm.jobs
* author: moe jette, jette@llnl.gov
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include "slurm.h"
#define BUF_SIZE 1024
#define NO_VAL (-99)
struct node_set { /* set of nodes with same configuration that could be allocated */
int cpus_per_node;
int nodes;
int weight;
int feature;
bitstr_t *my_bitmap;
};
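/* cpus_per_node and weight come from the matching configuration record, nodes is the
 * count of usable nodes in my_bitmap, and feature records which feature alternative
 * (as reported by valid_features) these nodes satisfy */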
int count_cpus (unsigned *bitmap);
int is_key_valid (int key);
int match_group (char *allow_groups, char *user_groups);
int match_feature (char *seek, char *available);
int parse_job_specs (char *job_specs, char **req_features,
char **req_node_list, char **job_name, char **req_group,
char **req_partition, int *contiguous, int *req_cpus,
int *req_nodes, int *min_cpus, int *min_memory,
int *min_tmp_disk, int *key, int *shared);
int pick_best_quadrics (bitstr_t *bitmap, bitstr_t *req_bitmap, int req_nodes,
int req_cpus, int consecutive);
int pick_best_nodes (struct node_set *node_set_ptr, int node_set_size,
bitstr_t **req_bitmap, int req_cpus, int req_nodes,
int contiguous, int shared, int max_nodes);
int valid_features (char *requested, char *available);
#if DEBUG_MODULE
/* main is used here for testing purposes only */
int main (int argc, char *argv[]) {
int error_code, line_num, i;
FILE *command_file;
char in_line[BUF_SIZE], *node_list;
log_options_t opts = LOG_OPTS_STDERR_ONLY;
log_init(argv[0], opts, SYSLOG_FACILITY_DAEMON, NULL);
if (argc < 3) {
printf ("usage: %s <SLURM_CONF_file> <slurm_job_file>\n",
argv[0]);
exit (1);
}
error_code = init_slurm_conf ();
if (error_code) {
printf ("controller: error %d from init_slurm_conf\n",
error_code);
exit (error_code);
}
error_code = read_slurm_conf (argv[1]);
if (error_code) {
printf ("controller: error %d from read_slurm_conf\n",
error_code);
exit (error_code);
}
/* mark everything up and idle for testing */
bit_nset (idle_node_bitmap, 0, node_record_count-1);
bit_nset (up_node_bitmap, 0, node_record_count-1);
command_file = fopen (argv[2], "r");
if (command_file == NULL) {
fprintf (stderr,
"node_scheduler: error %d opening command file %s\n",
errno, argv[2]);
exit (1);
}
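/* exercise valid_features() with a few expressions; judging from the expected results
 * below, "&" requires every named feature, "|" accepts alternatives, and a bracketed
 * group such as "[fs1|fs2]" appears to pick exactly one alternative, with the return
 * value identifying which alternative was satisfied (0 means the request cannot be met) */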
i = valid_features ("fs1&fs2", "fs1");
if (i != 0)
printf ("valid_features error 1\n");
i = valid_features ("fs1|fs2", "fs1");
if (i != 1)
printf ("valid_features error 2\n");
i = valid_features ("fs1|fs2&fs3", "fs1,fs3");
if (i != 1)
printf ("valid_features error 3\n");
i = valid_features ("[fs1|fs2]&fs3", "fs2,fs3");
if (i != 2)
printf ("valid_features error 4\n");
i = valid_features ("fs0&[fs1|fs2]&fs3", "fs2,fs3");
if (i != 0)
printf ("valid_features error 5\n");
i = valid_features ("fs3&[fs1|fs2]&fs3", "fs2,fs3");
if (i != 2)
printf ("valid_features error 6\n");
line_num = 0;
printf ("\n");
while (fgets (in_line, BUF_SIZE, command_file)) {
if (in_line[strlen (in_line) - 1] == '\n')
in_line[strlen (in_line) - 1] = '\0';
line_num++;
error_code = select_nodes (in_line, &node_list);
if (error_code) {
if (strncmp (in_line, "JobName=FAIL", 12) != 0)
printf ("ERROR: ");
printf ("for job: %s\n", in_line);
printf ("node_scheduler: error %d from select_nodes on line %d\n\n",
error_code, line_num);
}
else {
if (strncmp (in_line, "JobName=FAIL", 12) == 0)
printf ("ERROR: ");
printf ("for job: %s\n nodes selected %s\n\n",
in_line, node_list);
xfree (node_list);
}
}
fclose (command_file);
exit (0);
}
#endif
/* for a given bitmap, change the state of specified nodes to stage_in */
/* this is a simple prototype for testing
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
*/
void
allocate_nodes (unsigned *bitmap) {
int i;
for (i = 0; i < node_record_count; i++) {
if (bit_test (bitmap, i) == 0)
continue;
node_record_table_ptr[i].node_state = STATE_STAGE_IN;
bit_clear (idle_node_bitmap, i);
}
return;
}
/*
 * count_cpus - report how many cpus are associated with the identified nodes
 * input: bitmap - a node bitmap
 * output: returns a cpu count
 * globals: node_record_count - number of nodes configured
 * node_record_table_ptr - pointer to global node table
 */
int
count_cpus (unsigned *bitmap) {
int i, sum;
sum = 0;
for (i = 0; i < node_record_count; i++) {
if (bit_test (bitmap, i) != 1)
continue;
sum += node_record_table_ptr[i].cpus;
}
return sum;
}
/*
 * is_key_valid - determine if supplied key is valid
 * input: key - a slurm key acquired by user root
 * output: returns 1 if key is valid, 0 otherwise
 * NOTE: this is only a placeholder for a future function
 */
int
is_key_valid (int key) {
if (key == NO_VAL)
return 0;
return 1;
}
/*
 * match_feature - determine if the desired feature (seek) is one of those available
 * input: seek - desired feature
 * available - comma separated list of features
 * output: returns 1 if found, 0 otherwise
 */
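/* for example, match_feature ("fs1", "fs3,fs1") should return 1 and
 * match_feature ("fs2", "fs3,fs1") should return 0 (feature names are illustrative) */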
int
match_feature (char *seek, char *available) {
char *tmp_available, *str_ptr3, *str_ptr4;
int found;
if (seek == NULL)
return 1; /* nothing to look for */
if (available == NULL)
return 0; /* nothing to find */
tmp_available = xmalloc (strlen (available) + 1);
strcpy (tmp_available, available);
found = 0;
str_ptr3 = (char *) strtok_r (tmp_available, ",", &str_ptr4);
while (str_ptr3) {
if (strcmp (seek, str_ptr3) == 0) { /* we have a match */
found = 1;
break;
}
str_ptr3 = (char *) strtok_r (NULL, ",", &str_ptr4);
}
xfree (tmp_available);
return found;
}
/*
 * match_group - determine if the user is a member of any groups permitted to use this partition
 * input: allow_groups - comma delimited list of groups permitted to use the partition,
 * NULL is for all groups
 * user_groups - comma delimited list of groups the user belongs to
 * output: returns 1 if user is member, 0 otherwise
 */
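/* for example, match_group ("lc,admin", "users,lc") should return 1 since "lc" appears
 * in both lists, while match_group ("lc,admin", "users") should return 0
 * (group names are illustrative) */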
int
match_group (char *allow_groups, char *user_groups) {
char *tmp_allow_group, *str_ptr1, *str_ptr2;
char *tmp_user_group, *str_ptr3, *str_ptr4;
if ((allow_groups == NULL) || /* anybody can use it */
(strcmp (allow_groups, "all") == 0))
return 1;
if (user_groups == NULL)
return 0; /* empty group list */
tmp_allow_group = xmalloc (strlen (allow_groups) + 1);
strcpy (tmp_allow_group, allow_groups);
tmp_user_group = xmalloc (strlen (user_groups) + 1);
str_ptr1 = (char *) strtok_r (tmp_allow_group, ",", &str_ptr2);
while (str_ptr1) {
strcpy (tmp_user_group, user_groups);
str_ptr3 = (char *) strtok_r (tmp_user_group, ",", &str_ptr4);
while (str_ptr3) {
if (strcmp (str_ptr1, str_ptr3) == 0) { /* we have a match */
xfree (tmp_allow_group);
xfree (tmp_user_group);
return 1;
}
str_ptr3 = (char *) strtok_r (NULL, ",", &str_ptr4);
}
str_ptr1 = (char *) strtok_r (NULL, ",", &str_ptr2);
}
xfree (tmp_allow_group);
xfree (tmp_user_group);
return 0; /* no match */
}
/*
 * parse_job_specs - pick the appropriate fields out of a job request specification
 * input: job_specs - string containing the specification
 * req_features, etc. - pointers to storage for the specifications
 * output: req_features, etc. - the job's specifications
 * returns 0 if no error, errno otherwise
 * NOTE: the calling function must xfree memory at req_features[0], req_node_list[0],
 * job_name[0], req_group[0], and req_partition[0]
 */
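/* a job specification is a single line of keyword=value pairs; based on the fields
 * parsed below, a request might look something like (values are illustrative only):
 * JobName=job01 Partition=batch TotalNodes=4 TotalCPUs=16 Features=fs1|fs2
 * Groups=users MinMemory=1024 MinTmpDisk=2048 Key=1234 Shared=0
 */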
int
parse_job_specs (char *job_specs, char **req_features, char **req_node_list,
char **job_name, char **req_group, char **req_partition,
int *contiguous, int *req_cpus, int *req_nodes,
int *min_cpus, int *min_memory, int *min_tmp_disk, int *key,
int *shared) {
int bad_index, error_code, i;
char *temp_specs;
req_features[0] = req_node_list[0] = req_group[0] = req_partition[0] =
job_name[0] = NULL;
*contiguous = *req_cpus = *req_nodes = *min_cpus = *min_memory =
*min_tmp_disk = NO_VAL;
*key = *shared = NO_VAL;
temp_specs = xmalloc (strlen (job_specs) + 1);
strcpy (temp_specs, job_specs);
error_code = load_string (job_name, "JobName=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_string (req_features, "Features=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_string (req_node_list, "NodeList=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_string (req_group, "Groups=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_string (req_partition, "Partition=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_integer (contiguous, "Contiguous", temp_specs);
if (error_code)
goto cleanup;
error_code = load_integer (req_cpus, "TotalCPUs=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_integer (req_nodes, "TotalNodes=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_integer (min_cpus, "MinCPUs=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_integer (min_memory, "MinMemory=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_integer (min_tmp_disk, "MinTmpDisk=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_integer (key, "Key=", temp_specs);
if (error_code)
goto cleanup;
error_code = load_integer (shared, "Shared=", temp_specs);
if (error_code)
goto cleanup;
bad_index = -1;
for (i = 0; i < strlen (temp_specs); i++) {
if (isspace ((int) temp_specs[i]) || (temp_specs[i] == '\n'))
continue;
bad_index = i;
break;
}
if (bad_index != -1) {
error ("parse_job_specs: bad job specification input: %s",
&temp_specs[bad_index]);
error_code = EINVAL;
}
xfree (temp_specs);
return error_code;
cleanup:
xfree (temp_specs);
if (req_features[0])
xfree (req_features[0]);
if (req_node_list[0])
xfree (req_node_list[0]);
if (req_group[0])
xfree (req_group[0]);
if (req_partition[0])
xfree (req_partition[0]);
if (job_name[0])
xfree (job_name[0]);
req_features[0] = req_node_list[0] = req_group[0] = req_partition[0] =
job_name[0] = NULL;
return error_code;
}
/*
 * pick_best_quadrics - identify the nodes which best fit the req_nodes and req_cpus counts
 * for a system with Quadrics elan interconnect.
 * "best" is defined as either a single set of consecutive nodes satisfying
 * the request and leaving the minimum number of unused nodes OR
 * the fewest number of consecutive node sets
 * input: bitmap - the bit map to search
 * req_bitmap - the bit map of nodes that must be selected, if not NULL these
 * have already been confirmed to be in the input bitmap
 * req_nodes - number of nodes required
 * req_cpus - number of cpus required
 * consecutive - 1 if the allocated nodes must be consecutive, 0 otherwise
 * output: bitmap - nodes not required to satisfy the request are cleared, others left set
 * returns zero on success, EINVAL otherwise
 * globals: node_record_count - count of nodes configured
 * node_record_table_ptr - pointer to global node table
 * NOTE: bitmap must be a superset of req_nodes at function call time
 */
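/* outline of the approach below: scan the node table once to build a list of runs
 * ("consecutive sets") of available nodes, recording each run's node and cpu totals and
 * any required nodes it contains; then repeatedly take nodes from the most suitable run
 * (preferring runs holding required nodes, then the smallest run large enough for what
 * remains, otherwise the largest run) until the node and cpu requirements are met */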
int
pick_best_quadrics (bitstr_t *bitmap, bitstr_t *req_bitmap, int req_nodes,
int req_cpus, int consecutive) {
int i, index, error_code, sufficient;
int *consec_nodes; /* how many nodes we can add from this consecutive set of nodes */
int *consec_cpus; /* how many cpus we can add from this consecutive set of nodes */
int *consec_start; /* where this consecutive set starts (index) */
int *consec_end; /* where this consecutive set ends (index) */
int *consec_req; /* are nodes from this set required (in req_bitmap) */
int consec_index, consec_size;
int rem_cpus, rem_nodes; /* remaining resources required */
int best_fit_nodes, best_fit_cpus, best_fit_req, best_fit_location,
best_fit_sufficient;
if (bitmap == NULL)
fatal ("pick_best_quadrics: bitmap pointer is NULL\n");
error_code = EINVAL; /* default is no fit */
consec_index = 0;
consec_size = 50; /* start allocation for 50 sets of consecutive nodes */
consec_cpus = xmalloc (sizeof (int) * consec_size);
consec_nodes = xmalloc (sizeof (int) * consec_size);
consec_start = xmalloc (sizeof (int) * consec_size);
consec_end = xmalloc (sizeof (int) * consec_size);
consec_req = xmalloc (sizeof (int) * consec_size);
consec_cpus[consec_index] = consec_nodes[consec_index] = 0;
consec_req[consec_index] = -1; /* no required nodes here by default */
rem_cpus = req_cpus;
rem_nodes = req_nodes;
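/* build the table of consecutive node sets: nodes listed in req_bitmap are charged
 * against the request immediately and remain set in the working bitmap, while all other
 * available nodes are cleared here and only set again if chosen below */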
for (index = 0; index < node_record_count; index++) {
if (bit_test (bitmap, index)) {
if (consec_nodes[consec_index] == 0)
consec_start[consec_index] = index;
i = node_record_table_ptr[index].cpus;
if (req_bitmap && bit_test (req_bitmap, index)) {
if (consec_req[consec_index] == -1) /* first required node in set */
consec_req[consec_index] = index;
rem_cpus -= i; /* reduce count of additional resources required */
rem_nodes--; /* reduce count of additional resources required */
}
else {
bit_clear (bitmap, index);
consec_cpus[consec_index] += i;
consec_nodes[consec_index]++;
}
}
else if (consec_nodes[consec_index] == 0) {
consec_req[consec_index] = -1; /* already picked up any required nodes */
/* re-use this record */
}
else {
consec_end[consec_index] = index - 1;
if (++consec_index >= consec_size) {
consec_size *= 2;
xrealloc (consec_cpus, sizeof (int) * consec_size);
xrealloc (consec_nodes, sizeof (int) * consec_size);
xrealloc (consec_start, sizeof (int) * consec_size);
xrealloc (consec_end, sizeof (int) * consec_size);
xrealloc (consec_req, sizeof (int) * consec_size);
}
consec_cpus[consec_index] = 0;
consec_nodes[consec_index] = 0;
consec_req[consec_index] = -1;
}
}
if (consec_nodes[consec_index] != 0)
consec_end[consec_index++] = index - 1;
#if DEBUG_SYSTEM > 1
info ("rem_cpus=%d, rem_nodes=%d", rem_cpus, rem_nodes);
for (i = 0; i < consec_index; i++) {
if (consec_req[i] != -1)
info ("start=%s, end=%s, nodes=%d, cpus=%d, req=%s",
node_record_table_ptr[consec_start[i]].name,
node_record_table_ptr[consec_end[i]].name,
consec_nodes[i], consec_cpus[i],
node_record_table_ptr[consec_req[i]].name);
else
info ("start=%s, end=%s, nodes=%d, cpus=%d",
node_record_table_ptr[consec_start[i]].name,
node_record_table_ptr[consec_end[i]].name,
consec_nodes[i], consec_cpus[i]);
}
#endif
while (consec_index) {
best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
best_fit_req = -1; /* first required node, -1 if none */
for (i = 0; i < consec_index; i++) {
if (consec_nodes[i] == 0)
continue;
sufficient = ((consec_nodes[i] >= rem_nodes)
&& (consec_cpus[i] >= rem_cpus));
if ((best_fit_nodes == 0) || /* first possibility */
((best_fit_req == -1) && (consec_req[i] != -1)) || /* required nodes */
(sufficient && (best_fit_sufficient == 0)) || /* first large enough */
(sufficient && (consec_cpus[i] < best_fit_cpus)) || /* less waste option */
((sufficient == 0) && (consec_cpus[i] > best_fit_cpus))) { /* larger option */
best_fit_cpus = consec_cpus[i];
best_fit_nodes = consec_nodes[i];
best_fit_location = i;
best_fit_req = consec_req[i];
best_fit_sufficient = sufficient;
}
}
if (best_fit_nodes == 0)
break;
if (consecutive && ((best_fit_nodes < rem_nodes) || (best_fit_cpus < rem_cpus)))
break; /* no hole large enough */
if (best_fit_req != -1) { /* work out from required nodes */
for (i = best_fit_req;
i <= consec_end[best_fit_location]; i++) {
if ((rem_nodes <= 0) && (rem_cpus <= 0))
break;
if (bit_test (bitmap, i))
continue;
bit_set (bitmap, i);
rem_nodes--;
rem_cpus -= node_record_table_ptr[i].cpus;
}
for (i = (best_fit_req - 1);
i >= consec_start[best_fit_location]; i--) {
if ((rem_nodes <= 0) && (rem_cpus <= 0))
break;
/* if (bit_test(bitmap, i)) continue; nothing set earlier */
bit_set (bitmap, i);
rem_nodes--;
rem_cpus -= node_record_table_ptr[i].cpus;
}
}
else {
for (i = consec_start[best_fit_location];
i <= consec_end[best_fit_location]; i++) {
if ((rem_nodes <= 0) && (rem_cpus <= 0))
break;
if (bit_test (bitmap, i))
continue;
bit_set (bitmap, i);
rem_nodes--;
rem_cpus -= node_record_table_ptr[i].cpus;
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0)) {
error_code = 0;
break;
}
consec_cpus[best_fit_location] = 0;
consec_nodes[best_fit_location] = 0;
}
cleanup:
if (consec_cpus)
xfree (consec_cpus);
if (consec_nodes)
xfree (consec_nodes);
if (consec_start)
xfree (consec_start);
if (consec_end)
xfree (consec_end);
if (consec_req)
xfree (consec_req);
return error_code;
}
/*
 * pick_best_nodes - from nodes satisfying partition and configuration specifications,
 * select the "best" set for the job
 * input: node_set_ptr - pointer to node specification information
 * node_set_size - number of entries in records pointed to by node_set_ptr
 * req_bitmap - pointer to bitmap of specific nodes required by the job, could be NULL
 * req_cpus - count of cpus required by the job
 * req_nodes - count of nodes required by the job
 * contiguous - set to 1 if allocated nodes must be contiguous, 0 otherwise
 * shared - set to 1 if nodes may be shared, 0 otherwise
 * max_nodes - maximum number of nodes permitted for job, -1 for none (partition limit)
 * output: req_bitmap - pointer to bitmap of selected nodes
 * returns 0 on success, EAGAIN if request can not be satisfied now,
 * EINVAL if request can never be satisfied (insufficient contiguous nodes)
 * NOTE: the caller must xfree memory pointed to by req_bitmap
 */
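/* sketch of the logic below: for each feature set value, avail_bitmap accumulates nodes
 * that are up (and idle unless sharing is permitted) and is used to try to satisfy the
 * request now, while total_bitmap accumulates every configured node and is used only to
 * decide whether the request could ever be satisfied ("runable"); if nothing works now,
 * EAGAIN is returned when the job is runable and EINVAL otherwise */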
int
pick_best_nodes (struct node_set *node_set_ptr, int node_set_size,
bitstr_t **req_bitmap, int req_cpus, int req_nodes,
int contiguous, int shared, int max_nodes) {
int error_code, i, j, size;
int total_nodes, total_cpus; /* total resources configured in partition */
int avail_nodes, avail_cpus; /* resources available for use now */
bitstr_t *avail_bitmap, *total_bitmap;
int max_feature, min_feature;
int *cpus_per_node;
int avail_set, total_set, runable;
if (node_set_size == 0)
return EINVAL;
if ((max_nodes != -1) && (req_nodes > max_nodes))
return EINVAL;
error_code = 0;
avail_bitmap = total_bitmap = NULL;
avail_nodes = avail_cpus = 0;
total_nodes = total_cpus = 0;
if (req_bitmap[0]) { /* specific nodes required */
/* NOTE: we have already confirmed that all of these nodes have a usable */
/* configuration and are in the proper partition */
if (req_nodes != 0)
total_nodes = bit_set_count (req_bitmap[0]);
if (req_cpus != 0)
total_cpus = count_cpus (req_bitmap[0]);
if ((max_nodes != -1) && (total_nodes > max_nodes))
return EINVAL;
if ((req_nodes <= total_nodes) && (req_cpus <= total_cpus)) {
if (bit_super_set (req_bitmap[0], up_node_bitmap) != 1)
return EAGAIN;
if ((shared != 1) &&
(bit_super_set (req_bitmap[0], idle_node_bitmap) != 1))
return EAGAIN;
return 0; /* user can have selected nodes, we're done! */
}
total_nodes = total_cpus = 0; /* reinitialize */
}
/* identify how many feature sets we have (e.g. "[fs1|fs2|fs3|fs4]") */
max_feature = min_feature = node_set_ptr[0].feature;
for (i = 1; i < node_set_size; i++) {
if (node_set_ptr[i].feature > max_feature)
max_feature = node_set_ptr[i].feature;
if (node_set_ptr[i].feature < min_feature)
min_feature = node_set_ptr[i].feature;
}
runable = 0; /* assume not runable until otherwise demonstrated */
for (j = min_feature; j <= max_feature; j++) {
avail_set = total_set = 0;
for (i = 0; i < node_set_size; i++) {
if (node_set_ptr[i].feature != j)
continue;
if (runable == 0) {
if (total_set)
bit_or (total_bitmap,
node_set_ptr[i].my_bitmap);
else {
total_bitmap = bit_copy (node_set_ptr[i].my_bitmap);
if (total_bitmap == NULL)
fatal ("bit_copy failed to allocate memory");
total_set = 1;
}
total_nodes += node_set_ptr[i].nodes;
total_cpus +=
(node_set_ptr[i].nodes * node_set_ptr[i].cpus_per_node);
}
bit_and (node_set_ptr[i].my_bitmap,
up_node_bitmap);
if (shared != 1)
bit_and (node_set_ptr[i].my_bitmap,
idle_node_bitmap);
node_set_ptr[i].nodes =
bit_set_count (node_set_ptr[i].my_bitmap);
if (avail_set)
bit_or (avail_bitmap,
node_set_ptr[i].my_bitmap);
else {
avail_bitmap = bit_copy (node_set_ptr[i].my_bitmap);
if (avail_bitmap == NULL)
fatal ("bit_copy memory allocation failure");
avail_set = 1;
}
avail_nodes += node_set_ptr[i].nodes;
avail_cpus +=
(node_set_ptr[i].nodes *
node_set_ptr[i].cpus_per_node);
if ((req_bitmap[0])
&& (bit_super_set (req_bitmap[0], avail_bitmap)
== 0))
continue;
if (avail_nodes < req_nodes)
continue;
if (avail_cpus < req_cpus)
continue;
error_code =
pick_best_quadrics (avail_bitmap, req_bitmap[0],
req_nodes, req_cpus,
contiguous);
if ((error_code == 0) && (max_nodes != -1)
&& (bit_set_count (avail_bitmap) > max_nodes)) {
error_code = EINVAL;
break;
}
if (error_code == 0) {
if (total_bitmap)
bit_free (total_bitmap);
if (req_bitmap[0])
bit_free (req_bitmap[0]);
req_bitmap[0] = avail_bitmap;
return 0;
}
}
if ((error_code == 0) && (runable == 0) &&
(total_nodes > req_nodes) && (total_cpus > req_cpus) &&
((req_bitmap[0] == NULL)
|| (bit_super_set (req_bitmap[0], avail_bitmap) == 1))
&& ((max_nodes == -1) || (req_nodes <= max_nodes))) {
/* determine if job could possibly run (if configured nodes all available) */
error_code =
pick_best_quadrics (total_bitmap, req_bitmap[0],
req_nodes, req_cpus,
contiguous);
if ((error_code == 0) && (max_nodes != -1)
&& (bit_set_count (avail_bitmap) > max_nodes))
error_code = EINVAL;
if (error_code == 0)
runable = 1;
}
if (avail_bitmap)
bit_free (avail_bitmap);
if (total_bitmap)
bit_free (total_bitmap);
avail_bitmap = total_bitmap = NULL;
if (error_code != 0)
break;
}
if (runable == 0)
error_code = EINVAL;
if (error_code == 0)
error_code = EAGAIN;
return error_code;
}
/*
 * select_nodes - select and allocate nodes to a job with the given specifications
 * input: job_specs - job specifications
 * node_list - pointer to node list returned
 * output: node_list - list of allocated nodes
 * returns 0 on success, EINVAL if not possible to satisfy request,
 * or EAGAIN if resources are presently busy
 * NOTE: the calling program must xfree the memory pointed to by node_list
 */
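/* overall flow: parse the job specification, locate the requested (or default) partition,
 * verify the job's key and group against the partition, translate any explicit node list
 * into a bitmap and validate it, build weight-ordered node sets from the configuration
 * list, then call pick_best_nodes to make the final selection */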
int
select_nodes (char *job_specs, char **node_list) {
char *req_features, *req_node_list, *job_name, *req_group,
*req_partition, *out_line;
int contiguous, req_cpus, req_nodes, min_cpus, min_memory,
min_tmp_disk;
int error_code, cpu_tally, node_tally, key, shared;
struct part_record *part_ptr;
bitstr_t *req_bitmap, *scratch_bitmap;
ListIterator config_record_iterator; /* for iterating through config_list */
struct config_record *config_record_point; /* pointer to config_record */
int i;
struct node_set *node_set_ptr;
int node_set_index, node_set_size;
req_features = req_node_list = job_name = req_group = req_partition =
NULL;
req_bitmap = scratch_bitmap = NULL;
contiguous = req_cpus = req_nodes = min_cpus = min_memory =
min_tmp_disk = NO_VAL;
key = shared = NO_VAL;
node_set_ptr = NULL;
config_record_iterator = NULL;
node_list[0] = NULL;
node_lock ();
part_lock ();
/* setup and basic parsing */
error_code =
parse_job_specs (job_specs, &req_features, &req_node_list,
&job_name, &req_group, &req_partition,
&contiguous, &req_cpus, &req_nodes,
&min_cpus, &min_memory, &min_tmp_disk, &key,
&shared);
if (error_code != 0) {
error_code = EINVAL; /* permanent error, invalid parsing */
error ("select_nodes: parsing failure on %s", job_specs);
goto cleanup;
}
if ((req_cpus == NO_VAL) && (req_nodes == NO_VAL) && (req_node_list == NULL)) {
info ("select_nodes: job failed to specify node_list, cpu or node count");
error_code = EINVAL;
goto cleanup;
}
if (contiguous == NO_VAL)
contiguous = 0; /* default not contiguous */
if (req_cpus == NO_VAL)
req_cpus = 0; /* default no cpu count requirements */
if (req_nodes == NO_VAL)
req_nodes = 0; /* default no node count requirements */
/* find selected partition */
if (req_partition) {
part_ptr =
list_find_first (part_list, &list_find_part,
req_partition);
if (part_ptr == NULL) {
info ("select_nodes: invalid partition specified: %s",
req_partition);
error_code = EINVAL;
goto cleanup;
}
}
else {
if (default_part_loc == NULL) {
error ("select_nodes: default partition not set.");
error_code = EINVAL;
goto cleanup;
}
part_ptr = default_part_loc;
}
/* can this user access this partition */
if (part_ptr->key && (is_key_valid (key) == 0)) {
info ("select_nodes: job lacks key required of partition %s",
part_ptr->name);
error_code = EINVAL;
goto cleanup;
}
if (match_group (part_ptr->allow_groups, req_group) == 0) {
info ("select_nodes: job lacks group required of partition %s",
part_ptr->name);
error_code = EINVAL;
goto cleanup;
}
/* check if the selected partition has sufficient resources to satisfy the request */
if (req_node_list) { /* ensure that selected nodes are in this partition */
error_code = node_name2bitmap (req_node_list, &req_bitmap);
if (error_code == EINVAL)
goto cleanup;
if (error_code != 0) {
error_code = EAGAIN; /* no memory */
goto cleanup;
}
if (contiguous == 1)
bit_fill_gaps (req_bitmap);
if (bit_super_set (req_bitmap, part_ptr->node_bitmap) != 1) {
info ("select_nodes: requested nodes %s not in partition %s",
req_node_list, part_ptr->name);
error_code = EINVAL;
goto cleanup;
}
i = count_cpus (req_bitmap);
if (i > req_cpus)
req_cpus = i;
i = bit_set_count (req_bitmap);
if (i > req_nodes)
req_nodes = i;
}
if (req_cpus > part_ptr->total_cpus) {
info ("select_nodes: too many cpus (%d) requested of partition %s(%d)",
req_cpus, part_ptr->name, part_ptr->total_cpus);
error_code = EINVAL;
goto cleanup;
}
if ((req_nodes > part_ptr->total_nodes)
|| (req_nodes > part_ptr->max_nodes)) {
if (part_ptr->total_nodes > part_ptr->max_nodes)
i = part_ptr->max_nodes;
else
i = part_ptr->total_nodes;
info ("select_nodes: too many nodes (%d) requested of partition %s(%d)",
req_nodes, part_ptr->name, i);
error_code = EINVAL;
goto cleanup;
}
if (part_ptr->shared == 2) /* shared=force */
shared = 1;
else if ((shared != 1) || (part_ptr->shared == 0)) /* user or partition want no sharing */
shared = 0;
/* pick up nodes from the weight ordered configuration list */
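/* each configuration record contributes one node_set entry: the nodes of that
 * configuration which fall within the selected partition, together with their cpu count,
 * scheduling weight, and the feature alternative they satisfy */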
node_set_index = 0;
node_set_size = 0;
node_set_ptr = (struct node_set *) xmalloc (sizeof (struct node_set));
node_set_ptr[node_set_size++].my_bitmap = NULL;
config_record_iterator = list_iterator_create (config_list);
if (config_record_iterator == NULL) {
fatal ("select_nodes: ListIterator_create unable to allocate memory");
error_code = EAGAIN;
goto cleanup;
}
while ((config_record_point =
(struct config_record *) list_next (config_record_iterator))) {
int tmp_feature, check_node_config;
tmp_feature =
valid_features (req_features,
config_record_point->feature);
if (tmp_feature == 0)
continue;
/* since nodes can register with more resources than defined in the configuration, */
/* we want to use those higher values for scheduling, but only as needed */
if ((min_cpus > config_record_point->cpus) ||
(min_memory > config_record_point->real_memory) ||
(min_tmp_disk > config_record_point->tmp_disk))
check_node_config = 1;
else
check_node_config = 0;
node_set_ptr[node_set_index].my_bitmap =
bit_copy (config_record_point->node_bitmap);
if (node_set_ptr[node_set_index].my_bitmap == NULL)
fatal ("bit_copy memory allocation failure");
bit_and (node_set_ptr[node_set_index].my_bitmap,
part_ptr->node_bitmap);
node_set_ptr[node_set_index].nodes =
bit_set_count (node_set_ptr[node_set_index].my_bitmap);
/* check configuration of individual nodes only if the baseline values in the */
/* configuration file are too low. this will slow scheduling on a very large cluster. */
if (check_node_config
&& (node_set_ptr[node_set_index].nodes != 0)) {
for (i = 0; i < node_record_count; i++) {
if (bit_test
(node_set_ptr[node_set_index].my_bitmap, i) == 0)
continue;
if ((min_cpus <=
node_record_table_ptr[i].cpus)
&& (min_memory <=
node_record_table_ptr[i].real_memory)
&& (min_tmp_disk <=
node_record_table_ptr[i].tmp_disk))
continue;
bit_clear (node_set_ptr[node_set_index].my_bitmap, i);
if ((--node_set_ptr[node_set_index].nodes) == 0)
break;
}
}
if (node_set_ptr[node_set_index].nodes == 0) {
bit_free (node_set_ptr[node_set_index].my_bitmap);
node_set_ptr[node_set_index].my_bitmap = NULL;
continue;
}
if (req_bitmap) {
if (scratch_bitmap)
bit_or (scratch_bitmap,
node_set_ptr[node_set_index].
my_bitmap);
else {
scratch_bitmap =
bit_copy (node_set_ptr[node_set_index].my_bitmap);
if (scratch_bitmap == NULL)
fatal ("bit_copy memory allocation failure");
}
}
node_set_ptr[node_set_index].cpus_per_node =
config_record_point->cpus;
node_set_ptr[node_set_index].weight =
config_record_point->weight;
node_set_ptr[node_set_index].feature = tmp_feature;
info ("found %d usable nodes from configuration with %s",
node_set_ptr[node_set_index].nodes,
config_record_point->nodes);
node_set_index++;
xrealloc (node_set_ptr, sizeof (struct node_set) * (node_set_index + 1));
node_set_ptr[node_set_size++].my_bitmap = NULL;
}
if (node_set_index == 0) {
info ("select_nodes: no node configurations satisfy requirements %d:%d:%d:%s",
min_cpus, min_memory, min_tmp_disk, req_features);
error_code = EINVAL;
goto cleanup;
}
if (node_set_ptr[node_set_index].my_bitmap)
bit_free (node_set_ptr[node_set_index].my_bitmap);
node_set_ptr[node_set_index].my_bitmap = NULL;
node_set_size = node_set_index;
if (req_bitmap) {
if ((scratch_bitmap == NULL)
|| (bit_super_set (req_bitmap, scratch_bitmap) != 1)) {
info ("select_nodes: requested nodes do not satisfy configurations requirements %d:%d:%d:%s",
min_cpus, min_memory, min_tmp_disk, req_features);
error_code = EINVAL;
goto cleanup;
}
}
/* pick the nodes providing a best-fit */
error_code = pick_best_nodes (node_set_ptr, node_set_size,
&req_bitmap, req_cpus, req_nodes,
contiguous, shared,
part_ptr->max_nodes);
if (error_code == EAGAIN)
goto cleanup;
if (error_code == EINVAL) {
info ("select_nodes: no nodes can satisfy job request");
goto cleanup;
}