/*****************************************************************************\
* node_scheduler.c - select and allocate nodes to jobs
* Note: there is a global node table (node_record_table_ptr)
*****************************************************************************
* Copyright (C) 2002 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* UCRL-CODE-2002-040.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <syslog.h>
#include <unistd.h>
#include <slurm/slurm_errno.h>
#include "src/common/hostlist.h"
#include "src/common/node_select.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/slurmctld.h"
#define BUF_SIZE 1024
#define MAX_RETRIES 10
struct node_set {		/* set of nodes with same configuration */
	uint32_t cpus_per_node;	/* NOTE: This is the minimum count,
				 * if FastSchedule==0 then individual
				 * nodes within the same configuration
				 * line (in slurm.conf) can actually
				 * have different CPU counts */
	uint32_t nodes;		/* count of nodes set in my_bitmap, refreshed
				 * by _pick_best_nodes() after filtering */
	uint32_t weight;	/* NOTE(review): not referenced in the visible
				 * part of this file; presumably the scheduling
				 * weight of the configuration - confirm */
	int feature;		/* feature-set index; _pick_best_nodes()
				 * processes node sets one feature value
				 * at a time */
	bitstr_t *my_bitmap;	/* nodes belonging to this set */
};
/* Prototypes of the file-local (static) helper functions */
static int _add_node_set_info(struct node_set *node_set_ptr,
bitstr_t ** node_bitmap,
int *node_cnt, int *cpu_cnt, int cr_enabled);
static int _build_node_list(struct job_record *job_ptr,
struct node_set **node_set_pptr,
int *node_set_size);
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
struct job_details *detail_ptr);
static int _match_feature(char *seek, char *available);
static int _nodes_in_sets(bitstr_t *req_bitmap,
struct node_set * node_set_ptr,
int node_set_size);
static void _node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit,
bitstr_t ** light_load_bit,
bitstr_t ** heavy_load_bit);
static int _pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap,
int min_nodes, int max_nodes);
static int _pick_best_nodes(struct node_set *node_set_ptr,
int node_set_size, bitstr_t ** select_bitmap,
struct job_record *job_ptr, uint32_t min_nodes,
uint32_t max_nodes, int shared,
uint32_t node_lim);
static int _valid_features(char *requested, char *available);
/*
* allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
* IN job_ptr - job being allocated resources
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
* last_node_update - last update time of node table
extern void allocate_nodes(struct job_record *job_ptr)
last_node_update = time(NULL);
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i))
make_node_alloc(&node_record_table_ptr[i], job_ptr);
return;
}
* count_cpus - report how many cpus are associated with the identified nodes
* IN bitmap - map of nodes to tally
* RET cpu count
* globals: node_record_count - number of nodes configured
* node_record_table_ptr - pointer to global node table
extern int count_cpus(unsigned *bitmap)
int i, sum;
sum = 0;
for (i = 0; i < node_record_count; i++) {
if (bit_test(bitmap, i) != 1)
continue;
if (slurmctld_conf.fast_schedule)
sum += node_record_table_ptr[i].config_ptr->cpus;
else
sum += node_record_table_ptr[i].cpus;
return sum;
}
/*
* deallocate_nodes - for a given job, deallocate its nodes and make

Moe Jette
committed
* their state NODE_STATE_COMPLETING
* IN job_ptr - pointer to terminating job (already in some COMPLETING state)

Moe Jette
committed
* IN timeout - true of job exhausted time limit, send REQUEST_KILL_TIMELIMIT
* RPC instead of REQUEST_TERMINATE_JOB
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
*/
extern void deallocate_nodes(struct job_record *job_ptr, bool timeout)
int i;
agent_arg_t *agent_args;
int buf_rec_size = 0, down_node_cnt = 0;
uint16_t base_state, no_resp_flag;
xassert(job_ptr);
xassert(job_ptr->details);
if (select_g_job_fini(job_ptr) != SLURM_SUCCESS)
error("select_g_job_fini(%u): %m", job_ptr->job_id);
agent_args = xmalloc(sizeof(agent_arg_t));

Moe Jette
committed
if (timeout)
agent_args->msg_type = REQUEST_KILL_TIMELIMIT;
else
agent_args->msg_type = REQUEST_TERMINATE_JOB;
kill_job = xmalloc(sizeof(kill_job_msg_t));
last_node_update = time(NULL);
kill_job->job_id = job_ptr->job_id;
kill_job->job_uid = job_ptr->user_id;
kill_job->select_jobinfo = select_g_copy_jobinfo(
job_ptr->select_jobinfo);
for (i = 0; i < node_record_count; i++) {
struct node_record *node_ptr = &node_record_table_ptr[i];
if (bit_test(job_ptr->node_bitmap, i) == 0)
base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND);
no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
if (base_state == NODE_STATE_DOWN) {
/* Issue the KILL RPC, but don't verify response */
down_node_cnt++;
bit_clear(job_ptr->node_bitmap, i);
job_ptr->node_cnt--;
make_node_comp(node_ptr, job_ptr);
#ifdef HAVE_FRONT_END /* Operate only on front-end */
if (agent_args->node_count > 0)
continue;
#endif
if ((agent_args->node_count + 1) > buf_rec_size) {
buf_rec_size += 128;
xrealloc((agent_args->slurm_addr),
(sizeof(struct sockaddr_in) *
buf_rec_size));
xrealloc((agent_args->node_names),
(MAX_NAME_LEN * buf_rec_size));
agent_args->slurm_addr[agent_args->node_count] =
node_ptr->slurm_addr;
strncpy(&agent_args->
node_names[MAX_NAME_LEN * agent_args->node_count],
node_ptr->name, MAX_NAME_LEN);
agent_args->node_count++;
if ((agent_args->node_count - down_node_cnt) == 0)
job_ptr->job_state &= (~JOB_COMPLETING);
if (agent_args->node_count == 0) {
error("Job %u allocated no nodes to be killed on",
job_ptr->job_id);
xfree(kill_job);
xfree(agent_args);
return;
agent_queue_request(agent_args);
/*
 * _match_feature - determine if the desired feature is one of those available
 * IN seek - desired feature
 * IN available - comma separated list of available features
 * RET 1 if found, 0 otherwise
 */
static int _match_feature(char *seek, char *available)
{
	char *tmp_available, *str_ptr3, *str_ptr4;
	int found;

	if (seek == NULL)
		return 1;	/* nothing to look for */
	if (available == NULL)
		return 0;	/* nothing to find */

	/* strtok_r() modifies its input, so tokenize a private copy */
	tmp_available = xstrdup(available);
	found = 0;
	str_ptr3 = (char *) strtok_r(tmp_available, ",", &str_ptr4);
	while (str_ptr3) {
		if (strcmp(seek, str_ptr3) == 0) {	/* we have a match */
			found = 1;
			break;
		}
		str_ptr3 = (char *) strtok_r(NULL, ",", &str_ptr4);
	}

	xfree(tmp_available);
	return found;
}
/*
 * _pick_best_load - choose the least loaded nodes able to satisfy a job
 *	request. Candidate sets are tried in order of increasing load:
 *	idle nodes, then idle plus lightly loaded nodes, then every node
 *	that was set in bitmap on entry.
 * IN job_ptr - pointer to job being scheduled
 * IN/OUT bitmap - usable nodes are set on input; on return it holds the
 *	last candidate set handed to select_g_job_test()
 * IN min_nodes - minimum count of nodes
 * IN max_nodes - maximum count of nodes (0==don't care)
 * RET result of the last select_g_job_test() call (zero on success)
 * globals: node_record_table_ptr - pointer to global node table
 *	(read by _node_load_bitmaps)
 * NOTE: bitmap must be a superset of req_nodes at the time that
 *	_pick_best_load is called
 */
static int
_pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap,
		int min_nodes, int max_nodes)
{
	bitstr_t *idle_map, *light_map, *heavy_map;
	int rc;

	_node_load_bitmaps(bitmap, &idle_map, &light_map, &heavy_map);

	/* Pass 1: restrict the candidates to nodes running no jobs */
	bit_and(bitmap, idle_map);
	FREE_NULL_BITMAP(idle_map);

	/* Explicitly required nodes must stay in the candidate set or the
	 * selection fails; their availability to this job has already been
	 * confirmed */
	if (job_ptr->details && job_ptr->details->req_node_bitmap)
		bit_or(bitmap, job_ptr->details->req_node_bitmap);
	rc = select_g_job_test(job_ptr, bitmap, min_nodes, max_nodes);

	if (rc) {
		/* Pass 2: add the nodes running a single job */
		bit_or(bitmap, light_map);
		rc = select_g_job_test(job_ptr, bitmap,
				       min_nodes, max_nodes);
		if (rc) {
			/* Pass 3: add all remaining nodes */
			bit_or(bitmap, heavy_map);
			rc = select_g_job_test(job_ptr, bitmap,
					       min_nodes, max_nodes);
		}
	}

	FREE_NULL_BITMAP(light_map);
	FREE_NULL_BITMAP(heavy_map);
	return rc;
}
/*
 * _node_load_bitmaps - split a node bitmap into three new bitmaps according
 *	to each node's running job count (run_job_cnt)
 * IN bitmap - map of nodes to test
 * OUT no_load_bit - nodes from bitmap with no jobs
 * OUT light_load_bit - nodes from bitmap with one job
 * OUT heavy_load_bit - nodes from bitmap with any other job count
 * NOTE: caller must free the created bitmaps
 */
static void
_node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit,
		   bitstr_t ** light_load_bit, bitstr_t ** heavy_load_bit)
{
	bitoff_t nbits = bit_size(bitmap);
	bitstr_t *idle_map = bit_alloc(nbits);
	bitstr_t *light_map = bit_alloc(nbits);
	bitstr_t *heavy_map = bit_alloc(nbits);
	int inx;

	if (!idle_map || !light_map || !heavy_map)
		fatal("bit_alloc malloc failure");

	for (inx = 0; inx < nbits; inx++) {
		int job_cnt;

		if (!bit_test(bitmap, inx))
			continue;

		/* classify the node by how many jobs it is running */
		job_cnt = node_record_table_ptr[inx].run_job_cnt;
		switch (job_cnt) {
		case 0:
			bit_set(idle_map, inx);
			break;
		case 1:
			bit_set(light_map, inx);
			break;
		default:
			bit_set(heavy_map, inx);
			break;
		}
	}

	*no_load_bit = idle_map;
	*light_load_bit = light_map;
	*heavy_load_bit = heavy_map;
}
/*
 * _pick_best_nodes - from a weight ordered list of all nodes satisfying a
 *	job's specifications, select the "best" for use
 * IN node_set_ptr - pointer to node specification information
 * IN node_set_size - number of entries in records pointed to by node_set_ptr
 * OUT select_bitmap - returns bitmap of selected nodes, must FREE_NULL_BITMAP
 * IN job_ptr - pointer to job being scheduled
 * IN min_nodes - minimum count of nodes required by the job
 * IN max_nodes - maximum count of nodes required by the job (0==no limit)
 * IN shared - set to 1 if nodes may be shared, 0 otherwise
 * IN node_lim - maximum number of nodes permitted for job,
 *	INFINITE for no limit (partition limit)
 * RET SLURM_SUCCESS on success,
 *	ESLURM_NODES_BUSY if request can not be satisfied now,
 *	ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE if request can never
 *	be satisfied, or
 *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE if the job can not be
 *	initiated until the partition's configuration changes
 * NOTE: the caller must FREE_NULL_BITMAP memory pointed to by select_bitmap
 * Notes: The algorithm is
 *	1) If required node list is specified, determine implicitly required
 *	   processor and node count
 *	2) Determine how many disjoint required "features" are represented
 *	   (e.g. "FS1|FS2|FS3")
 *	3) For each feature: find matching node table entries, identify nodes
 *	   that are up and available (idle or shared) and add them to a bit
 *	   map
 *	4) If nodes _not_ shared then call select_g_job_test() to select the
 *	   "best" of those based upon topology, else call _pick_best_load()
 *	   to pick the "best" nodes in terms of workload
 *	5) If request can't be satisfied now, execute select_g_job_test()
 *	   against the list of nodes that exist in any state (perhaps DOWN
 *	   DRAINED or ALLOCATED) to determine if the request can
 *	   ever be satisfied.
 */
static int
_pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
		 bitstr_t ** select_bitmap, struct job_record *job_ptr,
		 uint32_t min_nodes, uint32_t max_nodes,
		 int shared, uint32_t node_lim)
{
	int error_code = SLURM_SUCCESS, i, j, pick_code;
	int total_nodes = 0, total_cpus = 0;	/* total resources configured
						 * in partition */
	int avail_nodes = 0, avail_cpus = 0;	/* resources available for
						 * use now */
	bitstr_t *avail_bitmap = NULL, *total_bitmap = NULL;
	bitstr_t *partially_idle_node_bitmap = NULL;
	int max_feature, min_feature;
	bool runable_ever = false;	/* Job can ever run */
	bool runable_avail = false;	/* Job can run with available nodes */
	int cr_enabled = 0;
#ifdef HAVE_BGL
	uint16_t checked = 0;
#endif

	if (node_set_size == 0) {
		info("_pick_best_nodes: empty node set for selection");
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	}

	/* Is Consumable Resources enabled? */
	error_code = select_g_get_info_from_plugin (SELECT_CR_PLUGIN,
						    &cr_enabled);
	if (error_code != SLURM_SUCCESS)
		return error_code;

	if (cr_enabled) {
		shared = 0; /* No sharing when Consumable Resources is enabled */
		job_ptr->cr_enabled = cr_enabled; /* CR enabled for this job */

		debug3(" Is this Job %u in exclusive mode? %d cr_enabled %d",
		       job_ptr->job_id,
		       job_ptr->details->exclusive,
		       cr_enabled);

		if (job_ptr->details->exclusive) {
			partially_idle_node_bitmap = bit_copy(idle_node_bitmap);
		} else {
			/* Update partially_idle_node_bitmap to reflect the
			   idle and partially idle nodes */
			error_code = select_g_get_info_from_plugin (SELECT_CR_BITMAP,
						&partially_idle_node_bitmap);
		}

		if (error_code != SLURM_SUCCESS) {
			FREE_NULL_BITMAP(partially_idle_node_bitmap);
			return error_code;
		}
	}

	if (job_ptr->details->req_node_bitmap) {  /* specific nodes required */
		/* we have already confirmed that all of these nodes have a
		 * usable configuration and are in the proper partition */
		if (min_nodes != 0)
			total_nodes = bit_set_count(
				job_ptr->details->req_node_bitmap);
		if (job_ptr->num_procs != 0) {
			if (cr_enabled) {
				error_code = select_g_get_extra_jobinfo (NULL,
							job_ptr,
							SELECT_CR_CPU_COUNT,
							&total_cpus);
				if (error_code != SLURM_SUCCESS) {
					FREE_NULL_BITMAP(partially_idle_node_bitmap);
					return error_code;
				}
			} else
				total_cpus = count_cpus(
					job_ptr->details->req_node_bitmap);
		}
		if ((max_nodes != 0) &&
		    (total_nodes > max_nodes)) {
			info("_pick_best_nodes: required nodes exceed limit");
			if (cr_enabled)
				FREE_NULL_BITMAP(partially_idle_node_bitmap);
			return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		}
		if ((node_lim != INFINITE) && (total_nodes > node_lim)) {
			/* exceed partition node limit */
			if (cr_enabled)
				FREE_NULL_BITMAP(partially_idle_node_bitmap);
			return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
		}
		if ((min_nodes <= total_nodes) &&
		    (max_nodes <= min_nodes) &&
		    (job_ptr->num_procs <= total_cpus )) {
			/* the required nodes alone satisfy the request, so
			 * check right away whether they can be used now */
			if (!bit_super_set(job_ptr->details->req_node_bitmap,
					   avail_node_bitmap)) {
				if (cr_enabled)
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
			}
			if (shared) {
				if (!bit_super_set(job_ptr->details->
						   req_node_bitmap,
						   share_node_bitmap)) {
					if (cr_enabled)
						FREE_NULL_BITMAP(
							partially_idle_node_bitmap);
					return ESLURM_NODES_BUSY;
				}
			} else if (cr_enabled) {
				if (!bit_super_set(job_ptr->details->
						   req_node_bitmap,
						   partially_idle_node_bitmap)) {
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
					return ESLURM_NODES_BUSY;
				}
			} else {
				if (!bit_super_set(job_ptr->details->
						   req_node_bitmap,
						   idle_node_bitmap)) {
					if (cr_enabled)
						FREE_NULL_BITMAP(
							partially_idle_node_bitmap);
					return ESLURM_NODES_BUSY;
				}
			}
			/* still must go through select_g_job_test() to
			 * determine validity of request and/or perform
			 * set-up before job launch */
		}
		total_nodes = total_cpus = 0;	/* reinitialize */
	}

	/* identify how many feature sets we have (e.g. "[fs1|fs2|fs3|fs4]" */
	max_feature = min_feature = node_set_ptr[0].feature;
	for (i = 1; i < node_set_size; i++) {
		if (node_set_ptr[i].feature > max_feature)
			max_feature = node_set_ptr[i].feature;
		if (node_set_ptr[i].feature < min_feature)
			min_feature = node_set_ptr[i].feature;
	}

	for (j = min_feature; j <= max_feature; j++) {
		for (i = 0; i < node_set_size; i++) {
			bool pick_light_load = false;
			if (node_set_ptr[i].feature != j)
				continue;
			if (!runable_ever) {
				/* totals count whole nodes, so consumable
				 * resource accounting is not applied here */
				int cr_disabled = 0;
				error_code = _add_node_set_info(
						&node_set_ptr[i],
						&total_bitmap,
						&total_nodes, &total_cpus,
						cr_disabled);
				if (error_code != SLURM_SUCCESS) {
					/* release the working bitmaps built
					 * so far before bailing out */
					FREE_NULL_BITMAP(avail_bitmap);
					FREE_NULL_BITMAP(total_bitmap);
					if (cr_enabled)
						FREE_NULL_BITMAP(
							partially_idle_node_bitmap);
					return error_code;
				}
			}
			bit_and(node_set_ptr[i].my_bitmap, avail_node_bitmap);
			if (shared) {
#ifdef HAVE_BGL
				/* Exclude nodes which have jobs in COMPLETING
				 * state in order to insure Epilog completes
				 * before possibly scheduling another job to
				 * the same bglblock. */
				int ni;
				bit_and(node_set_ptr[i].my_bitmap,
					share_node_bitmap);
				for (ni = 0; ni < node_record_count; ni++) {
					if (node_record_table_ptr[ni].node_state
					    == NODE_STATE_COMPLETING)
						bit_clear(node_set_ptr[i].my_bitmap, ni);
				}
				/* pick_light_load = false; Non-overlapping blocks */
#else
				bit_and(node_set_ptr[i].my_bitmap,
					share_node_bitmap);
				pick_light_load = true;
#endif
			} else if (cr_enabled) {
				bit_and(node_set_ptr[i].my_bitmap,
					partially_idle_node_bitmap);
			} else {
				bit_and(node_set_ptr[i].my_bitmap,
					idle_node_bitmap);
			}
			node_set_ptr[i].nodes =
				bit_set_count(node_set_ptr[i].my_bitmap);
			error_code = _add_node_set_info(&node_set_ptr[i],
							&avail_bitmap,
							&avail_nodes,
							&avail_cpus,
							cr_enabled);
			if (error_code != SLURM_SUCCESS) {
				/* release the working bitmaps built so far
				 * before bailing out */
				FREE_NULL_BITMAP(avail_bitmap);
				FREE_NULL_BITMAP(total_bitmap);
				if (cr_enabled)
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				return error_code;
			}
			if ((job_ptr->details->req_node_bitmap) &&
			    (!bit_super_set(job_ptr->details->req_node_bitmap,
					    avail_bitmap)))
				continue;
			if ((avail_nodes < min_nodes) ||
			    ((max_nodes > min_nodes) &&
			     (avail_nodes < max_nodes)))
				continue;	/* Keep accumulating nodes */
			if (slurmctld_conf.fast_schedule
			    && (avail_cpus < job_ptr->num_procs))
				continue;	/* Keep accumulating CPUs */

			if (pick_light_load) {
				pick_code = _pick_best_load(job_ptr,
							    avail_bitmap,
							    min_nodes,
							    max_nodes);
			} else {
				pick_code = select_g_job_test(job_ptr,
							      avail_bitmap,
							      min_nodes,
							      max_nodes);
			}

			if (pick_code == SLURM_SUCCESS) {
				if ((node_lim != INFINITE) &&
				    (bit_set_count(avail_bitmap) > node_lim)) {
					/* end of tests for this feature */
					avail_nodes = 0;
					break;
				}
				FREE_NULL_BITMAP(total_bitmap);
				if (cr_enabled)
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				/* ownership of avail_bitmap passes to the
				 * caller */
				*select_bitmap = avail_bitmap;
				return SLURM_SUCCESS;
			}
		}

		/* try to get max_nodes now for this feature */
		if ((max_nodes > min_nodes) &&
		    (avail_nodes >= min_nodes) &&
		    (avail_nodes < max_nodes)) {
			pick_code = select_g_job_test(job_ptr, avail_bitmap,
						      min_nodes, max_nodes);
			if ((pick_code == SLURM_SUCCESS) &&
			    ((node_lim == INFINITE) ||
			     (bit_set_count(avail_bitmap) <= node_lim))) {
				FREE_NULL_BITMAP(total_bitmap);
				if (cr_enabled)
					FREE_NULL_BITMAP(partially_idle_node_bitmap);
				*select_bitmap = avail_bitmap;
				return SLURM_SUCCESS;
			}
		}

		/* determine if job could possibly run (if all configured
		 * nodes available) */
		if ((!runable_ever || !runable_avail)
		    && (total_nodes >= min_nodes)
		    && ((slurmctld_conf.fast_schedule == 0) ||
			(total_cpus >= job_ptr->num_procs))
		    && ((job_ptr->details->req_node_bitmap == NULL) ||
			(bit_super_set(job_ptr->details->req_node_bitmap,
				       total_bitmap)))) {
			if (!runable_avail) {
				FREE_NULL_BITMAP(avail_bitmap);
				avail_bitmap = bit_copy(total_bitmap);
				if (avail_bitmap == NULL)
					fatal("bit_copy malloc failure");
				bit_and(avail_bitmap, avail_node_bitmap);
				/* job_ptr->cr_enabled is cleared for the
				 * duration of this test and then restored */
				if (cr_enabled)
					job_ptr->cr_enabled = 0;
				pick_code = select_g_job_test(job_ptr,
							      avail_bitmap,
							      min_nodes,
							      max_nodes);
				if (cr_enabled)
					job_ptr->cr_enabled = 1;
				if (pick_code == SLURM_SUCCESS) {
					runable_ever = true;
					if ((node_lim == INFINITE) ||
					    (bit_set_count(avail_bitmap) <=
					     node_lim))
						runable_avail = true;
				}
			}
			if (!runable_ever) {
				if (cr_enabled)
					job_ptr->cr_enabled = 0;
				pick_code = select_g_job_test(job_ptr,
							      total_bitmap,
							      min_nodes,
							      max_nodes);
				if (cr_enabled)
					job_ptr->cr_enabled = 1;
				if (pick_code == SLURM_SUCCESS)
					runable_ever = true;
			}
		}
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(total_bitmap);
		if (error_code != SLURM_SUCCESS)
			break;
	}

	if (cr_enabled)
		FREE_NULL_BITMAP(partially_idle_node_bitmap);

	/* The job is not able to start right now, return a
	 * value indicating when the job can start */
	if (!runable_avail)
		error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
	if (!runable_ever) {
		error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		info("_pick_best_nodes: job never runnable");
	}
#ifdef HAVE_BGL
	select_g_set_jobinfo(job_ptr->select_jobinfo,
			     SELECT_DATA_CHECKED, &checked);
#endif
	if (error_code == SLURM_SUCCESS)
		error_code = ESLURM_NODES_BUSY;
	return error_code;
}
* _add_node_set_info - add info in node_set_ptr to node_bitmap
* IN node_set_ptr - node set info
* IN/OUT node_bitmap - add nodes in set to this bitmap
* IN/OUT node_cnt - add count of nodes in set to this total
* IN/OUT cpu_cnt - add count of cpus in set to this total
* IN cr_enabled - specify if consumable resources (of processors) is enabled
_add_node_set_info(struct node_set *node_set_ptr,
bitstr_t ** node_bitmap,
int *node_cnt, int *cpu_cnt, int cr_enabled)
int error_code = SLURM_SUCCESS, i;
xassert(node_set_ptr->my_bitmap);
if (*node_bitmap)
bit_or(*node_bitmap, node_set_ptr->my_bitmap);
else {
*node_bitmap = bit_copy(node_set_ptr->my_bitmap);
if (*node_bitmap == NULL)
fatal("bit_copy malloc failure");
}
if (cr_enabled == 0) {
*node_cnt += node_set_ptr->nodes;
*cpu_cnt += node_set_ptr->nodes * node_set_ptr->cpus_per_node;
} else {
for (i = 0; i < node_record_count; i++) {
int allocated_cpus;
if (bit_test (node_set_ptr->my_bitmap, i) == 0)
allocated_cpus = 0;
error_code = select_g_get_select_nodeinfo(
&node_record_table_ptr[i],
SELECT_CR_USED_CPUS,
&allocated_cpus);
if (error_code != SLURM_SUCCESS) {
error(" cons_res: Invalid Node reference",
node_record_table_ptr[i]);
return error_code;
}

Moe Jette
committed
*node_cnt += 1;
*cpu_cnt += node_set_ptr->cpus_per_node -
allocated_cpus;

Moe Jette
committed
debug3(" cons_res: _add_node_set_info node_cnt %d cpu_cnt %d ",
*node_cnt, *cpu_cnt);
}
return error_code;
* select_nodes - select and allocate nodes to a specific job
* IN job_ptr - pointer to the job record
* IN test_only - if set do not allocate nodes, just confirm they
* could be allocated now
* RET 0 on success, ESLURM code from slurm_errno.h otherwise
* globals: list_part - global list of partition info
* default_part_loc - pointer to default partition
* config_list - global list of node configuration info
* Notes: The algorithm is
* 1) Build a table (node_set_ptr) of nodes with the requisite
* configuration. Each table entry includes their weight,
* node_list, features, etc.
* 2) Call _pick_best_nodes() to select those nodes best satisfying
* the request, (e.g. best-fit or other criterion)
* 3) Call allocate_nodes() to perform the actual allocation
extern int select_nodes(struct job_record *job_ptr, bool test_only)
int error_code = SLURM_SUCCESS, i, shared, node_set_size = 0;
bitstr_t *select_bitmap = NULL;
struct job_details *detail_ptr = job_ptr->details;
struct node_set *node_set_ptr = NULL;
struct part_record *part_ptr = job_ptr->part_ptr;
uint32_t min_nodes, max_nodes, part_node_limit;
int super_user = false;
enum job_wait_reason fail_reason;
xassert(job_ptr);
xassert(job_ptr->magic == JOB_MAGIC);
if ((job_ptr->user_id == 0) || (job_ptr->user_id == getuid()))
super_user = true;
/* identify partition */
if (part_ptr == NULL) {
part_ptr = find_part_record(job_ptr->partition);
xassert(part_ptr);
job_ptr->part_ptr = part_ptr;
error("partition pointer reset for job %u, part %s",
job_ptr->job_id, job_ptr->partition);
}
/* Confirm that partition is up and has compatible nodes limits */
fail_reason = WAIT_NO_REASON;
if (part_ptr->state_up == 0)
fail_reason = WAIT_PART_STATE;
else if (job_ptr->priority == 0) /* user or administrator hold */
fail_reason = WAIT_HELD;
else if (super_user)
; /* ignore any time or node count limits */
else if ((job_ptr->time_limit != NO_VAL) &&
(job_ptr->time_limit > part_ptr->max_time))
fail_reason = WAIT_PART_TIME_LIMIT;
else if (((job_ptr->details->max_nodes != 0) &&
(job_ptr->details->max_nodes < part_ptr->min_nodes)) ||
(job_ptr->details->min_nodes > part_ptr->max_nodes))
fail_reason = WAIT_PART_NODE_LIMIT;
if (fail_reason != WAIT_NO_REASON) {
if (detail_ptr)
detail_ptr->wait_reason = fail_reason;
if (job_ptr->priority != 0) /* not user/admin hold */
job_ptr->priority = 1; /* sys hold, move to end of queue */
last_job_update = time(NULL);
return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
/* build sets of usable nodes based upon their configuration */
error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size);
if (error_code)
return error_code;
/* insure that selected nodes in these node sets */
if (job_ptr->details->req_node_bitmap) {
error_code = _nodes_in_sets(job_ptr->details->req_node_bitmap,
node_set_ptr, node_set_size);
if (error_code) {
info("No nodes satify requirements for JobId=%u",
job_ptr->job_id);
goto cleanup;
/* enforce both user's and partition's node limits */
if (super_user) {
min_nodes = job_ptr->details->min_nodes;
part_node_limit = INFINITE;
} else {
min_nodes = MAX(job_ptr->details->min_nodes,
part_ptr->min_nodes);
part_node_limit = part_ptr->max_nodes;
}
if (super_user || (job_ptr->details->max_nodes == 0) ||
(part_ptr->max_nodes == INFINITE))
max_nodes = job_ptr->details->max_nodes;
else
max_nodes = MIN(job_ptr->details->max_nodes,
part_ptr->max_nodes);
if (part_ptr->shared == SHARED_FORCE) /* shared=force */
shared = 1;
else if (part_ptr->shared == SHARED_NO) /* can't share */
shared = 0;
else
shared = job_ptr->details->shared;
error_code = _pick_best_nodes(node_set_ptr, node_set_size,
&select_bitmap, job_ptr,
min_nodes, max_nodes,
shared, part_node_limit);
if (error_code) {
if (detail_ptr)
detail_ptr->wait_reason = WAIT_RESOUCES;
if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
/* Required nodes are down or
* too many nodes requested */
debug3("JobId=%u not runnable with present config",
job_ptr->job_id);
job_ptr->priority = 1; /* Move to end of queue */
last_job_update = time(NULL);
} else if (error_code == ESLURM_NODES_BUSY)
slurm_sched_job_is_pending();
goto cleanup;
if (test_only) { /* set if job not highest priority */
slurm_sched_job_is_pending();
if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) {
/* Leave job queued, something is hosed */
error("select_g_job_begin(%u): %m", job_ptr->job_id);
error_code = ESLURM_NODES_BUSY;
goto cleanup;
}
/* assign the nodes and stage_in the job */
if (detail_ptr)
detail_ptr->wait_reason = WAIT_NO_REASON;
job_ptr->nodes = bitmap2node_name(select_bitmap);
job_ptr->node_bitmap = select_bitmap;
job_ptr->details->shared = shared;
select_bitmap = NULL; /* nothing left to free */
allocate_nodes(job_ptr);
build_node_details(job_ptr);
job_ptr->job_state = JOB_RUNNING;
job_ptr->start_time = job_ptr->time_last_active = time(NULL);
if (job_ptr->time_limit == NO_VAL)
job_ptr->time_limit = part_ptr->max_time;
if (job_ptr->time_limit == INFINITE)
job_ptr->end_time = job_ptr->start_time +
(365 * 24 * 60 * 60); /* secs in year */
job_ptr->end_time = job_ptr->start_time +
(job_ptr->time_limit * 60); /* secs */
if (job_ptr->mail_type & MAIL_JOB_BEGIN)
mail_job_info(job_ptr, MAIL_JOB_BEGIN);
cleanup:
FREE_NULL_BITMAP(select_bitmap);
if (node_set_ptr) {
for (i = 0; i < node_set_size; i++)
FREE_NULL_BITMAP(node_set_ptr[i].my_bitmap);
xfree(node_set_ptr);
}
return error_code;
}
/*
* _build_node_list - identify which nodes could be allocated to a job
* IN job_ptr - pointer to node to be scheduled
* OUT node_set_pptr - list of node sets which could be used for the job
* OUT node_set_size - number of node_set entries
* RET error code
*/
static int _build_node_list(struct job_record *job_ptr,
struct node_set **node_set_pptr,
int *node_set_size)
{
int node_set_inx;
struct node_set *node_set_ptr;
struct config_record *config_ptr;
struct part_record *part_ptr = job_ptr->part_ptr;
ListIterator config_iterator;
int tmp_feature, check_node_config, config_filter = 0;
struct job_details *detail_ptr = job_ptr->details;
bitstr_t *exc_node_mask = NULL;
node_set_inx = 0;
node_set_ptr = (struct node_set *)
xmalloc(sizeof(struct node_set) * 2);
node_set_ptr[node_set_inx+1].my_bitmap = NULL;
if (detail_ptr->exc_node_bitmap) {
exc_node_mask = bit_copy(detail_ptr->exc_node_bitmap);
if (exc_node_mask == NULL)
fatal("bit_copy malloc failure");
bit_not(exc_node_mask);
}
config_iterator = list_iterator_create(config_list);
if (config_iterator == NULL)
fatal("list_iterator_create malloc failure");
while ((config_ptr = (struct config_record *)
list_next(config_iterator))) {
tmp_feature = _valid_features(job_ptr->details->features,
config_ptr->feature);
if (tmp_feature == 0)
continue;
if ((detail_ptr->min_procs > config_ptr->cpus ) ||
(detail_ptr->min_memory > config_ptr->real_memory) ||
(detail_ptr->min_tmp_disk > config_ptr->tmp_disk))
config_filter = 1;
else
config_filter = 0;
/* since nodes can register with more resources than defined */
/* in the configuration, we want to use those higher values */
/* for scheduling, but only as needed (slower) */
if (slurmctld_conf.fast_schedule) {
if (config_filter)
continue;
check_node_config = 0;
} else if (config_filter) {