/* * node_scheduler.c - select and allocated nodes to jobs * see slurm.h for documentation on external functions and data structures * * NOTE: DEBUG_MODULE mode test with execution line * node_scheduler ../../etc/slurm.conf2 ../../etc/slurm.jobs * * author: moe jette, jette@llnl.gov */ #ifdef HAVE_CONFIG_H # include <config.h> #endif #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <syslog.h> #include "slurm.h" #define BUF_SIZE 1024 #define NO_VAL (-99) struct node_set { /* set of nodes with same configuration that could be allocated */ int cpus_per_node; int nodes; int weight; int feature; bitstr_t *my_bitmap; }; int count_cpus (unsigned *bitmap); int is_key_valid (int key); int match_group (char *allow_groups, char *user_groups); int match_feature (char *seek, char *available); int parse_job_specs (char *job_specs, char **req_features, char **req_node_list, char **job_name, char **req_group, char **req_partition, int *contiguous, int *req_cpus, int *req_nodes, int *min_cpus, int *min_memory, int *min_tmp_disk, int *key, int *shared); int pick_best_quadrics (bitstr_t *bitmap, bitstr_t *req_bitmap, int req_nodes, int req_cpus, int consecutive); int pick_best_nodes (struct node_set *node_set_ptr, int node_set_size, bitstr_t **req_bitmap, int req_cpus, int req_nodes, int contiguous, int shared, int max_nodes); int valid_features (char *requested, char *available); #if DEBUG_MODULE /* main is used here for testing purposes only */ main (int argc, char *argv[]) { int error_code, line_num, i; FILE *command_file; char in_line[BUF_SIZE], *node_list; log_options_t opts = LOG_OPTS_STDERR_ONLY; log_init(argv[0], opts, SYSLOG_FACILITY_DAEMON, NULL); if (argc < 3) { printf ("usage: %s <SLURM_CONF_file> <slurm_job_file>\n", argv[0]); exit (1); } error_code = init_slurm_conf (); if (error_code) { printf ("controller: error %d from init_slurm_conf\n", error_code); exit (error_code); } error_code = read_slurm_conf (argv[1]); if (error_code) { printf ("controller: error %d from read_slurm_conf\n", error_code); exit (error_code); } /* mark everything up and idle for testing */ bit_nset (idle_node_bitmap, 0, node_record_count-1); bit_nset (up_node_bitmap, 0, node_record_count-1); command_file = fopen (argv[2], "r"); if (command_file == NULL) { fprintf (stderr, "node_scheduler: error %d opening command file %s\n", errno, argv[2]); exit (1); } i = valid_features ("fs1&fs2", "fs1"); if (i != 0) printf ("valid_features error 1\n"); i = valid_features ("fs1|fs2", "fs1"); if (i != 1) printf ("valid_features error 2\n"); i = valid_features ("fs1|fs2&fs3", "fs1,fs3"); if (i != 1) printf ("valid_features error 3\n"); i = valid_features ("[fs1|fs2]&fs3", "fs2,fs3"); if (i != 2) printf ("valid_features error 4\n"); i = valid_features ("fs0&[fs1|fs2]&fs3", "fs2,fs3"); if (i != 0) printf ("valid_features error 5\n"); i = valid_features ("fs3&[fs1|fs2]&fs3", "fs2,fs3"); if (i != 2) printf ("valid_features error 6\n"); line_num = 0; printf ("\n"); while (fgets (in_line, BUF_SIZE, command_file)) { if (in_line[strlen (in_line) - 1] == '\n') in_line[strlen (in_line) - 1] = (char) NULL; line_num++; error_code = select_nodes (in_line, &node_list); if (error_code) { if (strncmp (in_line, "JobName=FAIL", 12) != 0) printf ("ERROR:"); printf ("for job: %s\n", in_line, node_list); printf ("node_scheduler: error %d from select_nodes on line %d\n\n", error_code, line_num); } else { if (strncmp (in_line, "job_name=fail", 12) == 0) printf ("ERROR: "); printf ("for job: %s\n nodes selected %s\n\n", in_line, node_list); xfree (node_list); } } } #endif /* for a given bitmap, change the state of specified nodes to stage_in */ /* this is a simple prototype for testing * globals: node_record_count - number of nodes in the system * node_record_table_ptr - pointer to global node table */ void allocate_nodes (unsigned *bitmap) { int i; for (i = 0; i < node_record_count; i++) { if (bit_test (bitmap, i) == 0) continue; node_record_table_ptr[i].node_state = STATE_STAGE_IN; bit_clear (idle_node_bitmap, i); } return; } /* * count_cpus - report how many cpus are associated with the identified nodes * input: bitmap - a node bitmap * output: returns a cpu count * globals: node_record_count - number of nodes configured * node_record_table_ptr - pointer to global node table */ int count_cpus (unsigned *bitmap) { int i, sum; sum = 0; for (i = 0; i < node_record_count; i++) { if (bit_test (bitmap, i) != 1) continue; sum += node_record_table_ptr[i].cpus; } return sum; } /* * is_key_valid - determine if supplied key is valid * input: key - a slurm key acquired by user root * output: returns 1 if key is valid, 0 otherwise * NOTE: this is only a placeholder for a future function */ int is_key_valid (int key) { if (key == NO_VAL) return 0; return 1; } /* * match_feature - determine if the desired feature (seek) is one of those available * input: seek - desired feature * available - comma separated list of features * output: returns 1 if found, 0 otherwise */ int match_feature (char *seek, char *available) { char *tmp_available, *str_ptr3, *str_ptr4; int found; if (seek == NULL) return 1; /* nothing to look for */ if (available == NULL) return 0; /* nothing to find */ tmp_available = xmalloc (strlen (available) + 1); strcpy (tmp_available, available); found = 0; str_ptr3 = (char *) strtok_r (tmp_available, ",", &str_ptr4); while (str_ptr3) { if (strcmp (seek, str_ptr3) == 0) { /* we have a match */ found = 1; break; } str_ptr3 = (char *) strtok_r (NULL, ",", &str_ptr4); } xfree (tmp_available); return found; } /* * match_group - determine if the user is a member of any groups permitted to use this partition * input: allow_groups - comma delimited list of groups permitted to use the partition, * NULL is for all groups * user_groups - comma delimited list of groups the user belongs to * output: returns 1 if user is member, 0 otherwise */ int match_group (char *allow_groups, char *user_groups) { char *tmp_allow_group, *str_ptr1, *str_ptr2; char *tmp_user_group, *str_ptr3, *str_ptr4; if ((allow_groups == NULL) || /* anybody can use it */ (strcmp (allow_groups, "all") == 0)) return 1; if (user_groups == NULL) return 0; /* empty group list */ tmp_allow_group = xmalloc (strlen (allow_groups) + 1); strcpy (tmp_allow_group, allow_groups); tmp_user_group = xmalloc (strlen (user_groups) + 1); str_ptr1 = (char *) strtok_r (tmp_allow_group, ",", &str_ptr2); while (str_ptr1) { strcpy (tmp_user_group, user_groups); str_ptr3 = (char *) strtok_r (tmp_user_group, ",", &str_ptr4); while (str_ptr3) { if (strcmp (str_ptr1, str_ptr3) == 0) { /* we have a match */ xfree (tmp_allow_group); xfree (tmp_user_group); return 1; } str_ptr3 = (char *) strtok_r (NULL, ",", &str_ptr4); } str_ptr1 = (char *) strtok_r (NULL, ",", &str_ptr2); } xfree (tmp_allow_group); xfree (tmp_user_group); return 0; /* no match */ } /* * parse_job_specs - pick the appropriate fields out of a job request specification * input: job_specs - string containing the specification * req_features, etc. - pointers to storage for the specifications * output: req_features, etc. - the job's specifications * returns 0 if no error, errno otherwise * NOTE: the calling function must xfree memory at req_features[0], req_node_list[0], * job_name[0], req_group[0], and req_partition[0] */ int parse_job_specs (char *job_specs, char **req_features, char **req_node_list, char **job_name, char **req_group, char **req_partition, int *contiguous, int *req_cpus, int *req_nodes, int *min_cpus, int *min_memory, int *min_tmp_disk, int *key, int *shared) { int bad_index, error_code, i; char *temp_specs; req_features[0] = req_node_list[0] = req_group[0] = req_partition[0] = job_name[0] = NULL; *contiguous = *req_cpus = *req_nodes = *min_cpus = *min_memory = *min_tmp_disk = NO_VAL; *key = *shared = NO_VAL; temp_specs = xmalloc (strlen (job_specs) + 1); strcpy (temp_specs, job_specs); error_code = load_string (job_name, "JobName=", temp_specs); if (error_code) goto cleanup; error_code = load_string (req_features, "Features=", temp_specs); if (error_code) goto cleanup; error_code = load_string (req_node_list, "NodeList=", temp_specs); if (error_code) goto cleanup; error_code = load_string (req_group, "Groups=", temp_specs); if (error_code) goto cleanup; error_code = load_string (req_partition, "Partition=", temp_specs); if (error_code) goto cleanup; error_code = load_integer (contiguous, "Contiguous", temp_specs); if (error_code) goto cleanup; error_code = load_integer (req_cpus, "TotalCPUs=", temp_specs); if (error_code) goto cleanup; error_code = load_integer (req_nodes, "TotalNodes=", temp_specs); if (error_code) goto cleanup; error_code = load_integer (min_cpus, "MinCPUs=", temp_specs); if (error_code) goto cleanup; error_code = load_integer (min_memory, "MinMemory=", temp_specs); if (error_code) goto cleanup; error_code = load_integer (min_tmp_disk, "MinTmpDisk=", temp_specs); if (error_code) goto cleanup; error_code = load_integer (key, "Key=", temp_specs); if (error_code) goto cleanup; error_code = load_integer (shared, "Shared=", temp_specs); if (error_code) goto cleanup; bad_index = -1; for (i = 0; i < strlen (temp_specs); i++) { if (isspace ((int) temp_specs[i]) || (temp_specs[i] == '\n')) continue; bad_index = i; break; } if (bad_index != -1) { error ("parse_job_specs: bad job specification input: %s", &temp_specs[bad_index]); error_code = EINVAL; } xfree (temp_specs); return error_code; cleanup: xfree (temp_specs); if (req_features[0]) xfree (req_features[0]); if (req_node_list[0]) xfree (req_node_list[0]); if (req_group[0]) xfree (req_group[0]); if (req_partition[0]) xfree (req_partition[0]); if (job_name[0]) xfree (job_name[0]); req_features[0] = req_node_list[0] = req_group[0] = req_partition[0] = job_name[0] = NULL; } /* * pick_best_quadrics - identify the nodes which best fit the req_nodes and req_cpus counts * for a system with Quadrics elan interconnect. * "best" is defined as either single set of consecutive nodes satisfying * the request and leaving the minimum number of unused nodes OR * the fewest number of consecutive node sets * input: bitmap - the bit map to search * req_bitmap - the bit map of nodes that must be selected, if not NULL these * have already been confirmed to be in the input bitmap * req_nodes - number of nodes required * req_cpus - number of cpus required * consecutive - nodes must be consecutive is 1, otherwise 0 * output: bitmap - nodes not required to satisfy the request are cleared, other left set * returns zero on success, EINVAL otherwise * globals: node_record_count - count of nodes configured * node_record_table_ptr - pointer to global node table * NOTE: bitmap must be a superset of req_nodes at function call time */ int pick_best_quadrics (bitstr_t *bitmap, bitstr_t *req_bitmap, int req_nodes, int req_cpus, int consecutive) { int i, index, error_code, sufficient; int *consec_nodes; /* how many nodes we can add from this consecutive set of nodes */ int *consec_cpus; /* how many nodes we can add from this consecutive set of nodes */ int *consec_start; /* where this consecutive set starts (index) */ int *consec_end; /* where this consecutive set ends (index) */ int *consec_req; /* are nodes from this set required (in req_bitmap) */ int consec_index, consec_size; int rem_cpus, rem_nodes; /* remaining resources required */ int best_fit_nodes, best_fit_cpus, best_fit_req, best_fit_location, best_fit_sufficient; if (bitmap == NULL) fatal ("pick_best_quadrics: bitmap pointer is NULL\n"); error_code = EINVAL; /* default is no fit */ consec_index = 0; consec_size = 50; /* start allocation for 50 sets of consecutive nodes */ consec_cpus = xmalloc (sizeof (int) * consec_size); consec_nodes = xmalloc (sizeof (int) * consec_size); consec_start = xmalloc (sizeof (int) * consec_size); consec_end = xmalloc (sizeof (int) * consec_size); consec_req = xmalloc (sizeof (int) * consec_size); consec_cpus[consec_index] = consec_nodes[consec_index] = 0; consec_req[consec_index] = -1; /* no required nodes here by default */ rem_cpus = req_cpus; rem_nodes = req_nodes; for (index = 0; index < node_record_count; index++) { if (bit_test (bitmap, index)) { if (consec_nodes[consec_index] == 0) consec_start[consec_index] = index; i = node_record_table_ptr[index].cpus; if (req_bitmap && bit_test (req_bitmap, index)) { if (consec_req[consec_index] == -1) /* first required node in set */ consec_req[consec_index] = index; rem_cpus -= i; /* reduce count of additional resources required */ rem_nodes--; /* reduce count of additional resources required */ } else { bit_clear (bitmap, index); consec_cpus[consec_index] += i; consec_nodes[consec_index]++; } } else if (consec_nodes[consec_index] == 0) { consec_req[consec_index] = -1; /* already picked up any required nodes */ /* re-use this record */ } else { consec_end[consec_index] = index - 1; if (++consec_index >= consec_size) { consec_size *= 2; xrealloc (consec_cpus, sizeof (int) * consec_size); xrealloc (consec_nodes, sizeof (int) * consec_size); xrealloc (consec_start, sizeof (int) * consec_size); xrealloc (consec_end, sizeof (int) * consec_size); xrealloc (consec_req, sizeof (int) * consec_size); } consec_cpus[consec_index] = 0; consec_nodes[consec_index] = 0; consec_req[consec_index] = -1; } } if (consec_nodes[consec_index] != 0) consec_end[consec_index++] = index - 1; #if DEBUG_SYSTEM > 1 info ("rem_cpus=%d, rem_nodes=%d", rem_cpus, rem_nodes); for (i = 0; i < consec_index; i++) { if (consec_req[i] != -1) info ("start=%s, end=%s, nodes=%d, cpus=%d, req=%s", node_record_table_ptr[consec_start[i]].name, node_record_table_ptr[consec_end[i]].name, consec_nodes[i], consec_cpus[i], node_record_table_ptr[consec_req[i]].name); else info ("start=%s, end=%s, nodes=%d, cpus=%d", node_record_table_ptr[consec_start[i]].name, node_record_table_ptr[consec_end[i]].name, consec_nodes[i], consec_cpus[i]); } #endif while (consec_index) { best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0; best_fit_req = -1; /* first required node, -1 if none */ for (i = 0; i < consec_index; i++) { if (consec_nodes[i] == 0) continue; sufficient = ((consec_nodes[i] >= rem_nodes) && (consec_cpus[i] >= rem_cpus)); if ((best_fit_nodes == 0) || /* first possibility */ ((best_fit_req == -1) && (consec_req[i] != -1)) || /* required nodes */ (sufficient && (best_fit_sufficient == 0)) || /* first large enough */ (sufficient && (consec_cpus[i] < best_fit_cpus)) || /* less waste option */ ((sufficient == 0) && (consec_cpus[i] > best_fit_cpus))) { /* larger option */ best_fit_cpus = consec_cpus[i]; best_fit_nodes = consec_nodes[i]; best_fit_location = i; best_fit_req = consec_req[i]; best_fit_sufficient = sufficient; } } if (best_fit_nodes == 0) break; if (consecutive && ((best_fit_nodes < rem_nodes) || (best_fit_cpus < rem_cpus))) break; /* no hole large enough */ if (best_fit_req != -1) { /* work out from required nodes */ for (i = best_fit_req; i <= consec_end[best_fit_location]; i++) { if ((rem_nodes <= 0) && (rem_cpus <= 0)) break; if (bit_test (bitmap, i)) continue; bit_set (bitmap, i); rem_nodes--; rem_cpus -= node_record_table_ptr[i].cpus; } for (i = (best_fit_req - 1); i >= consec_start[best_fit_location]; i--) { if ((rem_nodes <= 0) && (rem_cpus <= 0)) break; /* if (bit_test(bitmap, i)) continue; nothing set earlier */ bit_set (bitmap, i); rem_nodes--; rem_cpus -= node_record_table_ptr[i].cpus; } } else { for (i = consec_start[best_fit_location]; i <= consec_end[best_fit_location]; i++) { if ((rem_nodes <= 0) && (rem_cpus <= 0)) break; if (bit_test (bitmap, i)) continue; bit_set (bitmap, i); rem_nodes--; rem_cpus -= node_record_table_ptr[i].cpus; } } if ((rem_nodes <= 0) && (rem_cpus <= 0)) { error_code = 0; break; } consec_cpus[best_fit_location] = 0; consec_nodes[best_fit_location] = 0; } cleanup: if (consec_cpus) xfree (consec_cpus); if (consec_nodes) xfree (consec_nodes); if (consec_start) xfree (consec_start); if (consec_end) xfree (consec_end); if (consec_req) xfree (consec_req); return error_code; } /* * pick_best_nodes - from nodes satisfying partition and configuration specifications, * select the "best" for use * input: node_set_ptr - pointer to node specification information * node_set_size - number of entries in records pointed to by node_set_ptr * req_bitmap - pointer to bitmap of specific nodes required by the job, could be NULL * req_cpus - count of cpus required by the job * req_nodes - count of nodes required by the job * contiguous - set to 1 if allocated nodes must be contiguous, 0 otherwise * shared - set to 1 if nodes may be shared, 0 otherwise * max_nodes - maximum number of nodes permitted for job, -1 for none (partition limit) * output: req_bitmap - pointer to bitmap of selected nodes * returns 0 on success, EAGAIN if request can not be satisfied now, * EINVAL if request can never be satisfied (insufficient contiguous nodes) * NOTE: the caller must xfree memory pointed to by req_bitmap */ int pick_best_nodes (struct node_set *node_set_ptr, int node_set_size, bitstr_t **req_bitmap, int req_cpus, int req_nodes, int contiguous, int shared, int max_nodes) { int error_code, i, j, size; int total_nodes, total_cpus; /* total resources configured in partition */ int avail_nodes, avail_cpus; /* resources available for use now */ bitstr_t *avail_bitmap, *total_bitmap; int max_feature, min_feature; int *cpus_per_node; int avail_set, total_set, runable; if (node_set_size == 0) return EINVAL; if ((max_nodes != -1) && (req_nodes > max_nodes)) return EINVAL; error_code = 0; avail_bitmap = total_bitmap = NULL; avail_nodes = avail_cpus = 0; total_nodes = total_cpus = 0; if (req_bitmap[0]) { /* specific nodes required */ /* NOTE: we have already confirmed that all of these nodes have a usable */ /* configuration and are in the proper partition */ if (req_nodes != 0) total_nodes = bit_set_count (req_bitmap[0]); if (req_cpus != 0) total_cpus = count_cpus (req_bitmap[0]); if (total_nodes > max_nodes) return EINVAL; if ((req_nodes <= total_nodes) && (req_cpus <= total_cpus)) { if (bit_super_set (req_bitmap[0], up_node_bitmap) != 1) return EAGAIN; if ((shared != 1) && (bit_super_set (req_bitmap[0], idle_node_bitmap) != 1)) return EAGAIN; return 0; /* user can have selected nodes, we're done! */ } total_nodes = total_cpus = 0; /* reinitialize */ } /* identify how many feature sets we have (e.g. "[fs1|fs2|fs3|fs4]" */ max_feature = min_feature = node_set_ptr[0].feature; for (i = 1; i < node_set_size; i++) { if (node_set_ptr[i].feature > max_feature) max_feature = node_set_ptr[i].feature; if (node_set_ptr[i].feature < min_feature) min_feature = node_set_ptr[i].feature; } runable = 0; /* assume not runable until otherwise demonstrated */ for (j = min_feature; j <= max_feature; j++) { avail_set = total_set = 0; for (i = 0; i < node_set_size; i++) { if (node_set_ptr[i].feature != j) continue; if (runable == 0) { if (total_set) bit_or (total_bitmap, node_set_ptr[i].my_bitmap); else { total_bitmap = bit_copy (node_set_ptr[i].my_bitmap); if (total_bitmap == NULL) fatal ("bit_copy failed to allocate memory"); total_set = 1; } total_nodes += node_set_ptr[i].nodes; total_cpus += (node_set_ptr[i].nodes * node_set_ptr[i].cpus_per_node); } bit_and (node_set_ptr[i].my_bitmap, up_node_bitmap); if (shared != 1) bit_and (node_set_ptr[i].my_bitmap, idle_node_bitmap); node_set_ptr[i].nodes = bit_set_count (node_set_ptr[i].my_bitmap); if (avail_set) bit_or (avail_bitmap, node_set_ptr[i].my_bitmap); else { avail_bitmap = bit_copy (node_set_ptr[i].my_bitmap); if (avail_bitmap == NULL) fatal ("bit_copy memory allocation failure"); avail_set = 1; } avail_nodes += node_set_ptr[i].nodes; avail_cpus += (node_set_ptr[i].nodes * node_set_ptr[i].cpus_per_node); if ((req_bitmap[0]) && (bit_super_set (req_bitmap[0], avail_bitmap) == 0)) continue; if (avail_nodes < req_nodes) continue; if (avail_cpus < req_cpus) continue; error_code = pick_best_quadrics (avail_bitmap, req_bitmap[0], req_nodes, req_cpus, contiguous); if ((error_code == 0) && (max_nodes != -1) && (bit_set_count (avail_bitmap) > max_nodes)) { error_code = EINVAL; break; } if (error_code == 0) { if (total_bitmap) bit_free (total_bitmap); if (req_bitmap[0]) bit_free (req_bitmap[0]); req_bitmap[0] = avail_bitmap; return 0; } } if ((error_code == 0) && (runable == 0) && (total_nodes > req_nodes) && (total_cpus > req_cpus) && ((req_bitmap[0] == NULL) || (bit_super_set (req_bitmap[0], avail_bitmap) == 1)) && ((max_nodes == -1) || (req_nodes <= max_nodes))) { /* determine if job could possibly run (if configured nodes all available) */ error_code = pick_best_quadrics (total_bitmap, req_bitmap[0], req_nodes, req_cpus, contiguous); if ((error_code == 0) && (max_nodes != -1) && (bit_set_count (avail_bitmap) > max_nodes)) error_code = EINVAL; if (error_code == 0) runable = 1; } if (avail_bitmap) bit_free (avail_bitmap); if (total_bitmap) bit_free (total_bitmap); avail_bitmap = total_bitmap = NULL; if (error_code != 0) break; } if (runable == 0) error_code = EINVAL; if (error_code == 0) error_code = EAGAIN; return error_code; } /* * select_nodes - select and allocate nodes to a job with the given specifications * input: job_specs - job specifications * node_list - pointer to node list returned * output: node_list - list of allocated nodes * returns 0 on success, EINVAL if not possible to satisfy request, * or EAGAIN if resources are presently busy * NOTE: the calling program must xfree the memory pointed to by node_list */ int select_nodes (char *job_specs, char **node_list) { char *req_features, *req_node_list, *job_name, *req_group, *req_partition, *out_line; int contiguous, req_cpus, req_nodes, min_cpus, min_memory, min_tmp_disk; int error_code, cpu_tally, node_tally, key, shared; struct part_record *part_ptr; bitstr_t *req_bitmap, *scratch_bitmap; ListIterator config_record_iterator; /* for iterating through config_list */ struct config_record *config_record_point; /* pointer to config_record */ int i; struct node_set *node_set_ptr; int node_set_index, node_set_size; req_features = req_node_list = job_name = req_group = req_partition = NULL; req_bitmap = scratch_bitmap = NULL; contiguous = req_cpus = req_nodes = min_cpus = min_memory = min_tmp_disk = NO_VAL; key = shared = NO_VAL; node_set_ptr = NULL; config_record_iterator = NULL; node_list[0] = NULL; config_record_iterator = (ListIterator) NULL; node_lock (); part_lock (); /* setup and basic parsing */ error_code = parse_job_specs (job_specs, &req_features, &req_node_list, &job_name, &req_group, &req_partition, &contiguous, &req_cpus, &req_nodes, &min_cpus, &min_memory, &min_tmp_disk, &key, &shared); if (error_code != 0) { error_code = EINVAL; /* permanent error, invalid parsing */ error ("select_nodes: parsing failure on %s", job_specs); goto cleanup; } if ((req_cpus == NO_VAL) && (req_nodes == NO_VAL) && (req_node_list == NULL)) { info ("select_nodes: job failed to specify node_list, cpu or node count"); error_code = EINVAL; goto cleanup; } if (contiguous == NO_VAL) contiguous = 0; /* default not contiguous */ if (req_cpus == NO_VAL) req_cpus = 0; /* default no cpu count requirements */ if (req_nodes == NO_VAL) req_nodes = 0; /* default no node count requirements */ /* find selected partition */ if (req_partition) { part_ptr = list_find_first (part_list, &list_find_part, req_partition); if (part_ptr == NULL) { info ("select_nodes: invalid partition specified: %s", req_partition); error_code = EINVAL; goto cleanup; } } else { if (default_part_loc == NULL) { error ("select_nodes: default partition not set."); error_code = EINVAL; goto cleanup; } part_ptr = default_part_loc; } /* can this user access this partition */ if (part_ptr->key && (is_key_valid (key) == 0)) { info ("select_nodes: job lacks key required of partition %s", part_ptr->name); error_code = EINVAL; goto cleanup; } if (match_group (part_ptr->allow_groups, req_group) == 0) { info ("select_nodes: job lacks group required of partition %s", part_ptr->name); error_code = EINVAL; goto cleanup; } /* check if select partition has sufficient resources to satisfy request */ if (req_node_list) { /* insure that selected nodes are in this partition */ error_code = node_name2bitmap (req_node_list, &req_bitmap); if (error_code == EINVAL) goto cleanup; if (error_code != 0) { error_code = EAGAIN; /* no memory */ goto cleanup; } if (contiguous == 1) bit_fill_gaps (req_bitmap); if (bit_super_set (req_bitmap, part_ptr->node_bitmap) != 1) { info ("select_nodes: requested nodes %s not in partition %s", req_node_list, part_ptr->name); error_code = EINVAL; goto cleanup; } i = count_cpus (req_bitmap); if (i > req_cpus) req_cpus = i; i = bit_set_count (req_bitmap); if (i > req_nodes) req_nodes = i; } if (req_cpus > part_ptr->total_cpus) { info ("select_nodes: too many cpus (%d) requested of partition %s(%d)", req_cpus, part_ptr->name, part_ptr->total_cpus); error_code = EINVAL; goto cleanup; } if ((req_nodes > part_ptr->total_nodes) || (req_nodes > part_ptr->max_nodes)) { if (part_ptr->total_nodes > part_ptr->max_nodes) i = part_ptr->max_nodes; else i = part_ptr->total_nodes; info ("select_nodes: too many nodes (%d) requested of partition %s(%d)", req_nodes, part_ptr->name, i); error_code = EINVAL; goto cleanup; } if (part_ptr->shared == 2) /* shared=force */ shared = 1; else if ((shared != 1) || (part_ptr->shared == 0)) /* user or partition want no sharing */ shared = 0; /* pick up nodes from the weight ordered configuration list */ node_set_index = 0; node_set_size = 0; node_set_ptr = (struct node_set *) xmalloc (sizeof (struct node_set)); node_set_ptr[node_set_size++].my_bitmap = NULL; config_record_iterator = list_iterator_create (config_list); if (config_record_iterator == NULL) { fatal ("select_nodes: ListIterator_create unable to allocate memory"); error_code = EAGAIN; goto cleanup; } while (config_record_point = (struct config_record *) list_next (config_record_iterator)) { int tmp_feature, check_node_config; tmp_feature = valid_features (req_features, config_record_point->feature); if (tmp_feature == 0) continue; /* since nodes can register with more resources than defined in the configuration, */ /* we want to use those higher values for scheduling, but only as needed */ if ((min_cpus > config_record_point->cpus) || (min_memory > config_record_point->real_memory) || (min_tmp_disk > config_record_point->tmp_disk)) check_node_config = 1; else check_node_config = 0; node_set_ptr[node_set_index].my_bitmap = bit_copy (config_record_point->node_bitmap); if (node_set_ptr[node_set_index].my_bitmap == NULL) fatal ("bit_copy memory allocation failure"); bit_and (node_set_ptr[node_set_index].my_bitmap, part_ptr->node_bitmap); node_set_ptr[node_set_index].nodes = bit_set_count (node_set_ptr[node_set_index].my_bitmap); /* check configuration of individual nodes only if the check of baseline */ /* values in the configuration file are too low. this will slow the scheduling */ /* for very large cluster. */ if (check_node_config && (node_set_ptr[node_set_index].nodes != 0)) { for (i = 0; i < node_record_count; i++) { if (bit_test (node_set_ptr[node_set_index].my_bitmap, i) == 0) continue; if ((min_cpus <= node_record_table_ptr[i].cpus) && (min_memory <= node_record_table_ptr[i].real_memory) && (min_tmp_disk <= node_record_table_ptr[i].tmp_disk)) continue; bit_clear (node_set_ptr[node_set_index].my_bitmap, i); if ((--node_set_ptr[node_set_index].nodes) == 0) break; } } if (node_set_ptr[node_set_index].nodes == 0) { bit_free (node_set_ptr[node_set_index].my_bitmap); node_set_ptr[node_set_index].my_bitmap = NULL; continue; } if (req_bitmap) { if (scratch_bitmap) bit_or (scratch_bitmap, node_set_ptr[node_set_index]. my_bitmap); else { scratch_bitmap = bit_copy (node_set_ptr[node_set_index].my_bitmap); if (scratch_bitmap == NULL) fatal ("bit_copy memory allocation failure"); } } node_set_ptr[node_set_index].cpus_per_node = config_record_point->cpus; node_set_ptr[node_set_index].weight = config_record_point->weight; node_set_ptr[node_set_index].feature = tmp_feature; #if DEBUG_MODULE > 1 info ("found %d usable nodes from configuration with %s", node_set_ptr[node_set_index].nodes, config_record_point->nodes); #endif node_set_index++; xrealloc (node_set_ptr, sizeof (struct node_set) * (node_set_index + 1)); node_set_ptr[node_set_size++].my_bitmap = NULL; } if (node_set_index == 0) { info ("select_nodes: no node configurations satisfy requirements %d:%d:%d:%s", min_cpus, min_memory, min_tmp_disk, req_features); error_code = EINVAL; goto cleanup; } if (node_set_ptr[node_set_index].my_bitmap) bit_free (node_set_ptr[node_set_index].my_bitmap); node_set_ptr[node_set_index].my_bitmap = NULL; node_set_size = node_set_index; if (req_bitmap) { if ((scratch_bitmap == NULL) || (bit_super_set (req_bitmap, scratch_bitmap) != 1)) { info ("select_nodes: requested nodes do not satisfy configurations requirements %d:%d:%d:%s", min_cpus, min_memory, min_tmp_disk, req_features); error_code = EINVAL; goto cleanup; } } /* pick the nodes providing a best-fit */ error_code = pick_best_nodes (node_set_ptr, node_set_size, &req_bitmap, req_cpus, req_nodes, contiguous, shared, part_ptr->max_nodes); if (error_code == EAGAIN) goto cleanup; if (error_code == EINVAL) { info ("select_nodes: no nodes can satisfy job request"); goto cleanup; } /* mark the selected nodes as STATE_STAGE_IN */ allocate_nodes (req_bitmap); error_code = bitmap2node_name (req_bitmap, node_list); if (error_code) error ("bitmap2node_name error %d", error_code); cleanup: part_unlock (); node_unlock (); if (req_features) xfree (req_features); if (req_node_list) xfree (req_node_list); if (job_name) xfree (job_name); if (req_group) xfree (req_group); if (req_partition) xfree (req_partition); if (req_bitmap) bit_free (req_bitmap); if (scratch_bitmap) bit_free (scratch_bitmap); if (node_set_ptr) { for (i = 0; i < node_set_size; i++) { if (node_set_ptr[i].my_bitmap) bit_free (node_set_ptr[i].my_bitmap); } xfree (node_set_ptr); } if (config_record_iterator) list_iterator_destroy (config_record_iterator); return error_code; } /* valid_features - determine if the requested features are satisfied by those available * input: requested - requested features (by a job) * available - available features (on a node) * output: returns 0 if request is not satisfied, otherwise an integer indicating * which mutually exclusive feature is satisfied. for example * valid_features("[fs1|fs2|fs3|fs4]", "fs3") returns 3. see the * slurm administrator and user guides for details. returns 1 if * requirements are satisfied without mutually exclusive feature list. */ int valid_features (char *requested, char *available) { char *tmp_requested, *str_ptr1; int bracket, found, i, option, position, result; int last_op; /* last operation 0 for or, 1 for and */ int save_op, save_result; /* for bracket support */ if (requested == NULL) return 1; /* no constraints */ if (available == NULL) return 0; /* no features */ tmp_requested = xmalloc (strlen (requested) + 1); strcpy (tmp_requested, requested); bracket = option = position = 0; str_ptr1 = tmp_requested; /* start of feature name */ result = last_op = 1; /* assume good for now */ for (i = 0;; i++) { if (tmp_requested[i] == (char) NULL) { if (strlen (str_ptr1) == 0) break; found = match_feature (str_ptr1, available); if (last_op == 1) /* and */ result &= found; else /* or */ result |= found; break; } if (tmp_requested[i] == '&') { if (bracket != 0) { info ("valid_features: parsing failure 1 on %s", requested); result = 0; break; } tmp_requested[i] = (char) NULL; found = match_feature (str_ptr1, available); if (last_op == 1) /* and */ result &= found; else /* or */ result |= found; str_ptr1 = &tmp_requested[i + 1]; last_op = 1; /* and */ } else if (tmp_requested[i] == '|') { tmp_requested[i] = (char) NULL; found = match_feature (str_ptr1, available); if (bracket != 0) { if (found) option = position; position++; } if (last_op == 1) /* and */ result &= found; else /* or */ result |= found; str_ptr1 = &tmp_requested[i + 1]; last_op = 0; /* or */ } else if (tmp_requested[i] == '[') { bracket++; position = 1; save_op = last_op; save_result = result; last_op = result = 1; str_ptr1 = &tmp_requested[i + 1]; } else if (tmp_requested[i] == ']') { tmp_requested[i] = (char) NULL; found = match_feature (str_ptr1, available); if (found) option = position; result |= found; if (save_op == 1) /* and */ result &= save_result; else /* or */ result |= save_result; if ((tmp_requested[i + 1] == '&') && (bracket == 1)) { last_op = 1; str_ptr1 = &tmp_requested[i + 2]; } else if ((tmp_requested[i + 1] == '|') && (bracket == 1)) { last_op = 0; str_ptr1 = &tmp_requested[i + 2]; } else if ((tmp_requested[i + 1] == (char) NULL) && (bracket == 1)) { break; } else { error ("valid_features: parsing failure 2 on %s", requested); result = 0; break; } bracket = 0; } } if (position) result *= option; xfree (tmp_requested); return result; }