/*****************************************************************************\
* node_scheduler.c - select and allocated nodes to jobs
* Note: there is a global node table (node_record_table_ptr)
*****************************************************************************
* Copyright (C) 2002-2006 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <syslog.h>
#include <unistd.h>
#include <slurm/slurm_errno.h>
#include "src/common/hostlist.h"
#include "src/common/node_select.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/slurmctld.h"
#define MAX_RETRIES 10
struct node_set {		/* set of nodes with same configuration */
	uint32_t cpus_per_node;	/* NOTE: This is the minimum count,
				 * if FastSchedule==0 then individual
				 * nodes within the same configuration
				 * line (in slurm.conf) can actually
				 * have different CPU counts */
	int feature;		/* feature-set index this entry belongs to
				 * (compared against min/max_feature in
				 * _pick_best_nodes) */
	bitstr_t *my_bitmap;	/* map of nodes in this set */
	int nodes;		/* usable node count in my_bitmap; restored:
				 * read by _pick_best_nodes() and
				 * _add_node_set_info() */
};
static int _add_node_set_info(struct node_set *node_set_ptr,
bitstr_t ** node_bitmap,
int *node_cnt, int *cpu_cnt, int cr_enabled);
static int _build_node_list(struct job_record *job_ptr,
struct node_set **node_set_pptr,
int *node_set_size);
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
struct job_details *detail_ptr);
static int _match_feature(char *seek, char *available);
static int _nodes_in_sets(bitstr_t *req_bitmap,
struct node_set * node_set_ptr,
int node_set_size);
static void _node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit,
bitstr_t ** light_load_bit,
bitstr_t ** heavy_load_bit);
static int _pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, bool test_only);
static int _pick_best_nodes(struct node_set *node_set_ptr,
int node_set_size, bitstr_t ** select_bitmap,
struct job_record *job_ptr,
struct part_record *part_ptr,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes);
static int _valid_features(char *requested, char *available);
/*
* allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
* IN job_ptr - job being allocated resources
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
* last_node_update - last update time of node table
extern void allocate_nodes(struct job_record *job_ptr)
last_node_update = time(NULL);
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i))
make_node_alloc(&node_record_table_ptr[i], job_ptr);
return;
}
* count_cpus - report how many cpus are associated with the identified nodes
* IN bitmap - map of nodes to tally
* RET cpu count
* globals: node_record_count - number of nodes configured
* node_record_table_ptr - pointer to global node table
extern int count_cpus(unsigned *bitmap)
int i, sum;
sum = 0;
for (i = 0; i < node_record_count; i++) {
if (bit_test(bitmap, i) != 1)
continue;
if (slurmctld_conf.fast_schedule)
sum += node_record_table_ptr[i].config_ptr->cpus;
else
sum += node_record_table_ptr[i].cpus;
return sum;
}
/*
 * deallocate_nodes - for a given job, deallocate its nodes and make
 *	their state NODE_STATE_COMPLETING
 * IN job_ptr - pointer to terminating job (already in some COMPLETING state)
 * IN timeout - true if job exhausted time limit, send REQUEST_KILL_TIMELIMIT
 *	RPC instead of REQUEST_TERMINATE_JOB
 * IN suspended - true if job was already suspended (node's job_run_cnt
 *	already decremented);
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
 */
extern void deallocate_nodes(struct job_record *job_ptr, bool timeout,
			     bool suspended)
{
	int i;
	kill_job_msg_t *kill_job = NULL;
	agent_arg_t *agent_args = NULL;
	int down_node_cnt = 0;
	uint16_t base_state;

	xassert(job_ptr);
	xassert(job_ptr->details);

	/* let the select plugin release any per-job state first */
	if (select_g_job_fini(job_ptr) != SLURM_SUCCESS)
		error("select_g_job_fini(%u): %m", job_ptr->job_id);

	agent_args = xmalloc(sizeof(agent_arg_t));
	if (timeout)
		agent_args->msg_type = REQUEST_KILL_TIMELIMIT;
	else
		agent_args->msg_type = REQUEST_TERMINATE_JOB;
	/* NOTE(review): restored — hostlist_push() below requires an
	 * initialized hostlist; retry value per upstream, TODO confirm */
	agent_args->retry = 0;
	agent_args->hostlist = hostlist_create("");

	kill_job = xmalloc(sizeof(kill_job_msg_t));
	last_node_update = time(NULL);
	kill_job->job_id = job_ptr->job_id;
	kill_job->job_uid = job_ptr->user_id;
	kill_job->nodes = xstrdup(job_ptr->nodes);
	kill_job->time = time(NULL);
	kill_job->select_jobinfo = select_g_copy_jobinfo(
			job_ptr->select_jobinfo);

	for (i = 0; i < node_record_count; i++) {
		struct node_record *node_ptr = &node_record_table_ptr[i];
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;	/* node not part of this job */
		base_state = node_ptr->node_state & NODE_STATE_BASE;
		if (base_state == NODE_STATE_DOWN) {
			/* Issue the KILL RPC, but don't verify response */
			down_node_cnt++;
			bit_clear(job_ptr->node_bitmap, i);
			job_ptr->node_cnt--;
		}
		make_node_comp(node_ptr, job_ptr, suspended);
#ifdef HAVE_FRONT_END	/* Operate only on front-end */
		if (agent_args->node_count > 0)
			continue;
#endif
		hostlist_push(agent_args->hostlist, node_ptr->name);
		agent_args->node_count++;
	}

	/* if every node to signal was DOWN, the job is done COMPLETING */
	if ((agent_args->node_count - down_node_cnt) == 0)
		job_ptr->job_state &= (~JOB_COMPLETING);
	if (agent_args->node_count == 0) {
		error("Job %u allocated no nodes to be killed on",
		      job_ptr->job_id);
		xfree(kill_job->nodes);
		select_g_free_jobinfo(&kill_job->select_jobinfo);
		xfree(kill_job);
		xfree(agent_args);
		return;
	}

	/* agent owns kill_job from here; it frees the message */
	agent_args->msg_args = kill_job;
	agent_queue_request(agent_args);
	return;
}
/*
 * _match_feature - determine if the desired feature is one of those available
 * IN seek - desired feature
 * IN available - comma separated list of available features
 * RET 1 if found, 0 otherwise
 */
static int _match_feature(char *seek, char *available)
{
	char *tmp_available, *str_ptr3, *str_ptr4;
	int found;

	if (seek == NULL)
		return 1;	/* nothing to look for */
	if (available == NULL)
		return 0;	/* nothing to find; was SLURM_SUCCESS (0),
				 * made explicit since this function
				 * returns a boolean, not an errno */

	/* strtok_r modifies its input, so work on a copy */
	tmp_available = xstrdup(available);
	found = 0;
	str_ptr3 = (char *) strtok_r(tmp_available, ",", &str_ptr4);
	while (str_ptr3) {
		if (strcmp(seek, str_ptr3) == 0) {	/* we have a match */
			found = 1;
			break;
		}
		str_ptr3 = (char *) strtok_r(NULL, ",", &str_ptr4);
	}

	xfree(tmp_available);
	return found;
}
/*
 * _pick_best_load - Given a specification of scheduling requirements,
 *	identify the nodes which "best" satisfy the request.
 *	"best" is defined as the least loaded nodes
 * IN job_ptr - pointer to job being scheduled
 * IN/OUT bitmap - usable nodes are set on input, nodes not required to
 *	satisfy the request are cleared, other left set
 * IN min_nodes - minimum count of nodes
 * IN max_nodes - maximum count of nodes (0==don't care)
 * IN req_nodes - requested (or desired) count of nodes
 * RET zero on success, EINVAL otherwise
 * globals: node_record_count - count of nodes configured
 *	node_record_table_ptr - pointer to global node table
 * NOTE: bitmap must be a superset of req_nodes at the time that
 *	_pick_best_load is called
 */
static int
_pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap,
		uint32_t min_nodes, uint32_t max_nodes,
		uint32_t req_nodes, bool test_only)
{
	bitstr_t *no_load_bit, *light_load_bit, *heavy_load_bit;
	int error_code;

	_node_load_bitmaps(bitmap, &no_load_bit, &light_load_bit,
			   &heavy_load_bit);

	/* first try to use idle nodes */
	bit_and(bitmap, no_load_bit);
	FREE_NULL_BITMAP(no_load_bit);
	/* always include required nodes or selection algorithm fails,
	 * note that we have already confirmed these nodes are available
	 * to this job */
	if (job_ptr->details && job_ptr->details->req_node_bitmap)
		bit_or(bitmap, job_ptr->details->req_node_bitmap);
	/* NOTE(review): min_nodes/max_nodes arguments restored in the
	 * three calls below — the parameters were otherwise unused and
	 * the header comment documents them */
	error_code = select_g_job_test(job_ptr, bitmap,
				       min_nodes, max_nodes,
				       req_nodes, test_only);

	/* now try to use idle and lightly loaded nodes */
	if (error_code) {
		bit_or(bitmap, light_load_bit);
		error_code = select_g_job_test(job_ptr, bitmap,
					       min_nodes, max_nodes,
					       req_nodes, test_only);
	}
	FREE_NULL_BITMAP(light_load_bit);

	/* now try to use all possible nodes */
	if (error_code) {
		bit_or(bitmap, heavy_load_bit);
		error_code = select_g_job_test(job_ptr, bitmap,
					       min_nodes, max_nodes,
					       req_nodes, test_only);
	}
	FREE_NULL_BITMAP(heavy_load_bit);

	return error_code;
}
/*
 * _node_load_bitmaps - partition a bitmap of nodes into three new bitmaps
 *	according to how many jobs each node is currently running
 * IN bitmap - map of nodes to test
 * OUT no_load_bitmap - nodes from bitmap with no jobs
 * OUT light_load_bitmap - nodes from bitmap with one job
 * OUT heavy_load_bitmap - nodes from bitmap with two or more jobs
 * NOTE: caller must free the created bitmaps
 */
static void
_node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit,
		   bitstr_t ** light_load_bit, bitstr_t ** heavy_load_bit)
{
	int i;
	bitoff_t nbits = bit_size(bitmap);
	bitstr_t *idle_map  = bit_alloc(nbits);
	bitstr_t *light_map = bit_alloc(nbits);
	bitstr_t *heavy_map = bit_alloc(nbits);

	if ((idle_map == NULL) || (light_map == NULL) || (heavy_map == NULL))
		fatal("bit_alloc malloc failure");

	for (i = 0; i < nbits; i++) {
		int jobs;
		if (!bit_test(bitmap, i))
			continue;
		jobs = node_record_table_ptr[i].run_job_cnt;
		if (jobs >= 2)
			bit_set(heavy_map, i);
		else if (jobs == 1)
			bit_set(light_map, i);
		else
			bit_set(idle_map, i);
	}

	*no_load_bit    = idle_map;
	*light_load_bit = light_map;
	*heavy_load_bit = heavy_map;
}
/*
 * Decide if a job can share nodes with other jobs based on the
 * following three input parameters:
 *
 * IN user_flag - may be 0 (do not share nodes), 1 (node sharing allowed),
 *	or any other number means "don't care"
 * IN part_enum - current partition's node sharing policy
 * IN cons_res_flag - 1 if the consumable resources flag is enable, 0 otherwise
 *
 * RET - 1 if nodes can be shared, 0 if nodes cannot be shared
 */
static int
_resolve_shared_status(uint16_t user_flag, uint16_t part_enum,
		       int cons_res_flag)
{
	/* With consumable resources, nodes are shared by default; only
	 * an explicit user request of 0 yields exclusive nodes. */
	if (cons_res_flag)
		return (user_flag == 0) ? 0 : 1;

	/* The partition sharing option only applies when the consumable
	 * resources plugin is NOT in use. */
	if (part_enum == SHARED_FORCE)	/* shared=force */
		return 1;
	if (part_enum == SHARED_NO)	/* can't share */
		return 0;

	/* Partition permits sharing; honor it only if the user asked. */
	return (user_flag == 1) ? 1 : 0;
}
/*
 * _pick_best_nodes - from a weight ordered list of all nodes satisfying a
 *	job's specifications, select the "best" for use
 * IN node_set_ptr - pointer to node specification information
 * IN node_set_size - number of entries in records pointed to by node_set_ptr
 * OUT select_bitmap - returns bitmap of selected nodes, must FREE_NULL_BITMAP
 * IN job_ptr - pointer to job being scheduled
 * IN part_ptr - pointer to the partition in which the job is being scheduled
 * IN min_nodes - minimum count of nodes required by the job
 * IN max_nodes - maximum count of nodes required by the job (0==no limit)
 * IN req_nodes - requested (or desired) count of nodes
 * RET SLURM_SUCCESS on success,
 *	ESLURM_NODES_BUSY if request can not be satisfied now,
 *	ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE if request can never
 *	be satisfied, or
 *	ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE if the job can not be
 *	initiated until the partition's configuration changes
 * NOTE: the caller must FREE_NULL_BITMAP memory pointed to by select_bitmap
 * Notes: The algorithm is
 *	1) If required node list is specified, determine implicitly required
 *	   processor and node count
 *	2) Determine how many disjoint required "features" are represented
 *	   (e.g. "FS1|FS2|FS3")
 *	3) For each feature: find matching node table entries, identify nodes
 *	   that are up and available (idle or shared) and add them to a bit
 *	   map
 *	4) If nodes _not_ shared then call select_g_job_test() to select the
 *	   "best" of those based upon topology, else call _pick_best_load()
 *	   to pick the "best" nodes in terms of workload
 *	5) If request can't be satisfied now, execute select_g_job_test()
 *	   against the list of nodes that exist in any state (perhaps DOWN
 *	   DRAINED or ALLOCATED) to determine if the request can
 *	   ever be satisfied
 */
/* NOTE(review): the body below has demonstrably lost source lines
 * (unbalanced braces, truncated argument lists, an orphaned #else with
 * no matching #if, dangling expression fragments).  The code is kept
 * byte-for-byte as found, with only comments added; restore it from the
 * upstream revision history before attempting to compile. */
static int
_pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
		 bitstr_t ** select_bitmap, struct job_record *job_ptr,
		 struct part_record *part_ptr,
		 uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes)
	int error_code = SLURM_SUCCESS, i, j, pick_code;
	int total_nodes = 0, total_cpus = 0;	/* total resources configured
						 * in partition */
	int avail_nodes = 0, avail_cpus = 0;	/* resources available for
						 * use now */
	bitstr_t *avail_bitmap = NULL, *total_bitmap = NULL;
	bitstr_t *partially_idle_node_bitmap = NULL, *possible_bitmap = NULL;
	int max_feature, min_feature;
	bool runable_ever = false;	/* Job can ever run */
	bool runable_avail = false;	/* Job can run with available nodes */
	int cr_enabled = 0;
	int shared = 0;
	if (node_set_size == 0) {
		info("_pick_best_nodes: empty node set for selection");
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	/* NOTE(review): closing brace of the if above appears lost */
	/* Is Consumable Resources enabled? */
	error_code = select_g_get_info_from_plugin (SELECT_CR_PLUGIN,
						    &cr_enabled);
	if (error_code != SLURM_SUCCESS)
		return error_code;
	shared = _resolve_shared_status(job_ptr->details->shared,
					part_ptr->shared, cr_enabled);
	job_ptr->details->shared = shared;
	if (cr_enabled) {
		job_ptr->cr_enabled = cr_enabled; /* CR enabled for this job */
		debug3(" Is this Job %u in exclusive mode? %d cr_enabled %d",
		       job_ptr->job_id, shared, cr_enabled);
		if (shared == 0) {
			partially_idle_node_bitmap = bit_copy(idle_node_bitmap);
		} else {
			/* Update partially_idle_node_bitmap to reflect the
			 * idle and partially idle nodes */
			error_code = select_g_get_info_from_plugin (
					SELECT_CR_BITMAP,
					&partially_idle_node_bitmap);
		}
		if (error_code != SLURM_SUCCESS) {
			FREE_NULL_BITMAP(partially_idle_node_bitmap);
			return error_code;
		}
	}
	if (job_ptr->details->req_node_bitmap) { /* specific nodes required */
		/* we have already confirmed that all of these nodes have a
		 * usable configuration and are in the proper partition */
		if (min_nodes != 0)
			total_nodes = bit_set_count(
				job_ptr->details->req_node_bitmap);
		if (job_ptr->num_procs != 0) {
			error_code = select_g_get_extra_jobinfo (
					NULL,
					job_ptr,
					SELECT_CR_CPU_COUNT,
					&total_cpus);
			if (error_code != SLURM_SUCCESS) {
				FREE_NULL_BITMAP(
					partially_idle_node_bitmap);
	/* NOTE(review): error-return and else-branch lines appear lost here */
		total_cpus = count_cpus(
			job_ptr->details->req_node_bitmap);
		if (total_nodes > max_nodes) {
			/* exceeds node limit */
			if (cr_enabled)
				FREE_NULL_BITMAP(partially_idle_node_bitmap);
			return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
		if ((min_nodes <= total_nodes) &&
		    (max_nodes <= min_nodes) &&
		    (job_ptr->num_procs <= total_cpus )) {
			if (!bit_super_set(job_ptr->details->req_node_bitmap,
					   avail_node_bitmap)) {
				if (cr_enabled)
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
			if (cr_enabled) {
				if (!bit_super_set(job_ptr->details->
						   req_node_bitmap,
						   partially_idle_node_bitmap)) {
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
					return ESLURM_NODES_BUSY;
				}
			} else if (shared) {
				if (!bit_super_set(job_ptr->details->
						   req_node_bitmap,
						   share_node_bitmap)) {
					if (cr_enabled)
						FREE_NULL_BITMAP(
							partially_idle_node_bitmap);
					return ESLURM_NODES_BUSY;
	/* NOTE(review): an "} else {" separating the shared and idle
	 * checks appears lost here */
				if (!bit_super_set(job_ptr->details->
						   req_node_bitmap,
						   idle_node_bitmap)) {
					if (cr_enabled)
						FREE_NULL_BITMAP(
							partially_idle_node_bitmap);
					return ESLURM_NODES_BUSY;
		/* still must go through select_g_job_test() to
		 * determine validity of request and/or perform
		 * set-up before job launch */
		total_nodes = total_cpus = 0;	/* reinitialize */
	/* identify how many feature sets we have (e.g. "[fs1|fs2|fs3|fs4]" */
	max_feature = min_feature = node_set_ptr[0].feature;
	for (i = 1; i < node_set_size; i++) {
		if (node_set_ptr[i].feature > max_feature)
			max_feature = node_set_ptr[i].feature;
		if (node_set_ptr[i].feature < min_feature)
			min_feature = node_set_ptr[i].feature;
	for (j = min_feature; j <= max_feature; j++) {
		/* we use this var to go straight down the list if the
		 * first one doesn't work we go to the next until the
		 * list is empty.
		 */
		int tries = 0;
		for (i = 0; i < node_set_size; i++) {
			bool pick_light_load = false;
			if (node_set_ptr[i].feature != j)
				continue;
			if (!runable_ever) {
				int cr_disabled = 0;
				error_code = _add_node_set_info(
					&node_set_ptr[i],
					&total_bitmap,
					&total_nodes, &total_cpus,
					cr_disabled);
				if (error_code != SLURM_SUCCESS) {
					if (cr_enabled) {
						FREE_NULL_BITMAP(
							partially_idle_node_bitmap);
					}
					FREE_NULL_BITMAP(total_bitmap);
					FREE_NULL_BITMAP(possible_bitmap);
					return error_code;
			bit_and(node_set_ptr[i].my_bitmap, avail_node_bitmap);
			if (cr_enabled) {
				bit_and(node_set_ptr[i].my_bitmap,
					partially_idle_node_bitmap);
			} else if (shared) {
				/* If any nodes which can be used have jobs in
				 * COMPLETING state then do not schedule the
				 * job, this give time to insure Epilog
				 * completes before possibly scheduling another
				 * job to the same bgblock. We also want to
				 * route the job to the smallest usable block*/
				int ni;
				bit_and(node_set_ptr[i].my_bitmap,
					share_node_bitmap);
				for (ni = 0; ni < node_record_count; ni++) {
	/* NOTE(review): the opening #if of the #else below and the
	 * interior of this loop (node_record access) appear lost */
						node_state & NODE_STATE_COMPLETING)
						continue;
				}
#else
				bit_and(node_set_ptr[i].my_bitmap,
					share_node_bitmap);
				pick_light_load = true;
#endif
			} else {
				bit_and(node_set_ptr[i].my_bitmap,
					idle_node_bitmap);
			}
			node_set_ptr[i].nodes =
				bit_set_count(node_set_ptr[i].my_bitmap);
			error_code = _add_node_set_info(&node_set_ptr[i],
							&avail_bitmap,
							&avail_nodes,
							&avail_cpus,
							cr_enabled);
			if (error_code != SLURM_SUCCESS) {
				if (cr_enabled)
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				FREE_NULL_BITMAP(total_bitmap);
				FREE_NULL_BITMAP(avail_bitmap);
				FREE_NULL_BITMAP(possible_bitmap);
				return error_code;
			if ((job_ptr->details->req_node_bitmap) &&
			    (!bit_super_set(job_ptr->details->req_node_bitmap,
					    avail_bitmap)))
				continue;
			if ((avail_nodes < min_nodes) ||
			    ((req_nodes > min_nodes) &&
			     (avail_nodes < req_nodes)))
				continue;	/* Keep accumulating nodes */
			if (slurmctld_conf.fast_schedule
			    && (avail_cpus < job_ptr->num_procs))
				continue;	/* Keep accumulating CPUs */
			if (pick_light_load) {
				pick_code = _pick_best_load(job_ptr,
							    avail_bitmap,
							    min_nodes,
	/* NOTE(review): remaining arguments and the else branch of this
	 * call pair appear lost */
				pick_code = select_g_job_test(job_ptr,
							      avail_bitmap,
							      min_nodes,
			if (pick_code == SLURM_SUCCESS) {
				if (bit_set_count(avail_bitmap) > max_nodes) {
					/* end of tests for this feature */
					avail_nodes = 0;
					break;
				}
				FREE_NULL_BITMAP(total_bitmap);
				FREE_NULL_BITMAP(possible_bitmap);
				if (cr_enabled)
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				*select_bitmap = avail_bitmap;
			} else {
				/* reset the counters and start from the
				 * next node in the list */
				FREE_NULL_BITMAP(avail_bitmap);
				avail_nodes = 0;
				avail_cpus = 0;
				tries++;
				i = tries;
		/* try to get req_nodes now for this feature */
		if ((req_nodes > min_nodes) &&
		    (avail_nodes >= min_nodes) &&
		    (avail_nodes < req_nodes)) {
			pick_code = select_g_job_test(job_ptr, avail_bitmap,
			if ((pick_code == SLURM_SUCCESS) &&
			    (bit_set_count(avail_bitmap) <= max_nodes)) {
				FREE_NULL_BITMAP(total_bitmap);
				FREE_NULL_BITMAP(possible_bitmap);
				if (cr_enabled)
					FREE_NULL_BITMAP(
						partially_idle_node_bitmap);
				*select_bitmap = avail_bitmap;
				return SLURM_SUCCESS;
			}
		}
		/* determine if job could possibly run (if all configured
		 * nodes available) */
		if ((!runable_ever || !runable_avail)
		    && (total_nodes >= min_nodes)
		    && ((slurmctld_conf.fast_schedule == 0) ||
			(total_cpus >= job_ptr->num_procs))
		    && ((job_ptr->details->req_node_bitmap == NULL) ||
			(bit_super_set(job_ptr->details->req_node_bitmap,
				       total_bitmap)))) {
			if (!runable_avail) {
				FREE_NULL_BITMAP(avail_bitmap);
				avail_bitmap = bit_copy(total_bitmap);
				if (avail_bitmap == NULL)
					fatal("bit_copy malloc failure");
				bit_and(avail_bitmap, avail_node_bitmap);
				pick_code = select_g_job_test(job_ptr,
							      avail_bitmap,
							      min_nodes,
							      true);
				if (cr_enabled)
					job_ptr->cr_enabled = 1;
				if (pick_code == SLURM_SUCCESS) {
					runable_ever = true;
					if (bit_set_count(avail_bitmap) <=
					    max_nodes)
						runable_avail = true;
					possible_bitmap = avail_bitmap;
					avail_bitmap = NULL;
			if (!runable_ever) {
				pick_code = select_g_job_test(job_ptr,
							      total_bitmap,
							      min_nodes,
							      true);
				if (cr_enabled)
					job_ptr->cr_enabled = 1;
				if (pick_code == SLURM_SUCCESS) {
					possible_bitmap = total_bitmap;
					total_bitmap = NULL;
					runable_ever = true;
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(total_bitmap);
		if (error_code != SLURM_SUCCESS)
	/* NOTE(review): the statement governed by the if above appears lost */
	if (cr_enabled)
		FREE_NULL_BITMAP(partially_idle_node_bitmap);
	/* The job is not able to start right now, return a
	 * value indicating when the job can start */
	if (!runable_avail)
		error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
	if (!runable_ever) {
		error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		info("_pick_best_nodes: job never runnable");
	if (error_code == SLURM_SUCCESS) {
		error_code = ESLURM_NODES_BUSY;
		*select_bitmap = possible_bitmap;
	}
	return error_code;
}
* _add_node_set_info - add info in node_set_ptr to node_bitmap
* IN node_set_ptr - node set info
* IN/OUT node_bitmap - add nodes in set to this bitmap
* IN/OUT node_cnt - add count of nodes in set to this total
* IN/OUT cpu_cnt - add count of cpus in set to this total
* IN cr_enabled - specify if consumable resources (of processors) is enabled
_add_node_set_info(struct node_set *node_set_ptr,
bitstr_t ** node_bitmap,
int *node_cnt, int *cpu_cnt, int cr_enabled)
int error_code = SLURM_SUCCESS, i;
xassert(node_set_ptr->my_bitmap);
if (*node_bitmap)
bit_or(*node_bitmap, node_set_ptr->my_bitmap);
else {
*node_bitmap = bit_copy(node_set_ptr->my_bitmap);
if (*node_bitmap == NULL)
fatal("bit_copy malloc failure");
}
if (cr_enabled == 0) {
*node_cnt += node_set_ptr->nodes;
*cpu_cnt += node_set_ptr->nodes * node_set_ptr->cpus_per_node;
} else {
for (i = 0; i < node_record_count; i++) {
int allocated_cpus;
if (bit_test (node_set_ptr->my_bitmap, i) == 0)
allocated_cpus = 0;
error_code = select_g_get_select_nodeinfo(
&node_record_table_ptr[i],
SELECT_CR_USED_CPUS,
&allocated_cpus);
if (error_code != SLURM_SUCCESS) {
error(" cons_res: Invalid Node reference",
node_record_table_ptr[i]);
return error_code;
}

Moe Jette
committed
*node_cnt += 1;
*cpu_cnt += node_set_ptr->cpus_per_node -
allocated_cpus;

Moe Jette
committed
debug3(" cons_res: _add_node_set_info node_cnt %d cpu_cnt %d ",
*node_cnt, *cpu_cnt);
}
return error_code;
/*
 * select_nodes - select and allocate nodes to a specific job
 * IN job_ptr - pointer to the job record
 * IN test_only - if set do not allocate nodes, just confirm they
 *	could be allocated now
 * IN select_node_bitmap - bitmap of nodes to be used for the
 *	job's resource allocation (not returned if NULL), caller
 *	must free
 * RET 0 on success, ESLURM code from slurm_errno.h otherwise
 * globals: list_part - global list of partition info
 *	default_part_loc - pointer to default partition
 *	config_list - global list of node configuration info
 * Notes: The algorithm is
 *	1) Build a table (node_set_ptr) of nodes with the requisite
 *	   configuration. Each table entry includes their weight,
 *	   node_list, features, etc.
 *	2) Call _pick_best_nodes() to select those nodes best satisfying
 *	   the request, (e.g. best-fit or other criterion)
 *	3) Call allocate_nodes() to perform the actual allocation
 */
extern int select_nodes(struct job_record *job_ptr, bool test_only,
bitstr_t **select_node_bitmap)
int error_code = SLURM_SUCCESS, i, node_set_size = 0;
bitstr_t *select_bitmap = NULL;
struct job_details *detail_ptr = job_ptr->details;
struct node_set *node_set_ptr = NULL;
struct part_record *part_ptr = job_ptr->part_ptr;
uint32_t min_nodes, max_nodes, req_nodes;
int super_user = false;
enum job_wait_reason fail_reason;
xassert(job_ptr);
xassert(job_ptr->magic == JOB_MAGIC);
if ((job_ptr->user_id == 0) || (job_ptr->user_id == getuid()))
super_user = true;
/* identify partition */
if (part_ptr == NULL) {
part_ptr = find_part_record(job_ptr->partition);
xassert(part_ptr);
job_ptr->part_ptr = part_ptr;
error("partition pointer reset for job %u, part %s",
job_ptr->job_id, job_ptr->partition);
}
/* Confirm that partition is up and has compatible nodes limits */
fail_reason = WAIT_NO_REASON;
if (part_ptr->state_up == 0)
fail_reason = WAIT_PART_STATE;
else if (job_ptr->priority == 0) /* user or administrator hold */
fail_reason = WAIT_HELD;
else if (super_user)
; /* ignore any time or node count limits */
else if ((job_ptr->time_limit != NO_VAL) &&
(job_ptr->time_limit > part_ptr->max_time))
fail_reason = WAIT_PART_TIME_LIMIT;
else if (((job_ptr->details->max_nodes != 0) &&
(job_ptr->details->max_nodes < part_ptr->min_nodes)) ||
(job_ptr->details->min_nodes > part_ptr->max_nodes))
fail_reason = WAIT_PART_NODE_LIMIT;
if (fail_reason != WAIT_NO_REASON) {
if (detail_ptr)
detail_ptr->wait_reason = fail_reason;
last_job_update = time(NULL);
if (job_ptr->priority == 0) /* user/admin hold */
return ESLURM_JOB_HELD;
job_ptr->priority = 1; /* sys hold, move to end of queue */
return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
/* build sets of usable nodes based upon their configuration */
error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size);
if (error_code)
return error_code;
/* insure that selected nodes in these node sets */
if (job_ptr->details->req_node_bitmap) {
error_code = _nodes_in_sets(job_ptr->details->req_node_bitmap,
node_set_ptr, node_set_size);
if (error_code) {
info("No nodes satisfy requirements for JobId=%u",
job_ptr->job_id);
goto cleanup;
/* enforce both user's and partition's node limits */
/* info("req: %u-%u, %u", job_ptr->details->min_nodes,
job_ptr->details->max_nodes, part_ptr->max_nodes); */
if (super_user) {
min_nodes = job_ptr->details->min_nodes;
} else {
min_nodes = MAX(job_ptr->details->min_nodes,
part_ptr->min_nodes);
}
if (job_ptr->details->max_nodes == 0) {
if (super_user)
max_nodes = INFINITE;
else
max_nodes = part_ptr->max_nodes;
} else if (super_user)
max_nodes = job_ptr->details->max_nodes;
else
max_nodes = MIN(job_ptr->details->max_nodes,
part_ptr->max_nodes);
max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
req_nodes = max_nodes;
else
req_nodes = min_nodes;
/* info("nodes:%u:%u:%u", min_nodes, req_nodes, max_nodes); */
error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
error_code = _pick_best_nodes(node_set_ptr, node_set_size,
&select_bitmap, job_ptr,
part_ptr, min_nodes, max_nodes,
req_nodes);
if (error_code) {
if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
/* Required nodes are down or
* too many nodes requested */
debug3("JobId=%u not runnable with present config",
job_ptr->job_id);
if (job_ptr->priority != 0) /* Move to end of queue */
job_ptr->priority = 1;
last_job_update = time(NULL);
} else if (error_code == ESLURM_NODES_BUSY)
slurm_sched_job_is_pending();
goto cleanup;
if (test_only) { /* set if job not highest priority */
slurm_sched_job_is_pending();