Newer
Older
/*****************************************************************************\
* node_scheduler.c - select and allocated nodes to jobs
* Note: there is a global node table (node_record_table_ptr)
*****************************************************************************
* Copyright (C) 2002-2006 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <syslog.h>
#include <unistd.h>
#include <slurm/slurm_errno.h>
#include "src/common/hostlist.h"
#include "src/common/node_select.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/slurmctld.h"
#define MAX_RETRIES 10
struct node_set { /* set of nodes with same configuration */
uint32_t cpus_per_node; /* NOTE: This is the minimum count,
* if FastSchedule==0 then individual
* nodes within the same configuration
* line (in slurm.conf) can actually
* have different CPU counts */
int feature;
bitstr_t *my_bitmap;
static int _add_node_set_info(struct node_set *node_set_ptr,
bitstr_t ** node_bitmap,
int *node_cnt, int *cpu_cnt, int cr_enabled);
static int _build_node_list(struct job_record *job_ptr,
struct node_set **node_set_pptr,
int *node_set_size);
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
struct job_details *detail_ptr);
static int _match_feature(char *seek, char *available);
static int _nodes_in_sets(bitstr_t *req_bitmap,
struct node_set * node_set_ptr,
int node_set_size);
static void _node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit,
bitstr_t ** light_load_bit,
bitstr_t ** heavy_load_bit);
static int _pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, bool test_only);
static int _pick_best_nodes(struct node_set *node_set_ptr,
int node_set_size, bitstr_t ** select_bitmap,
struct job_record *job_ptr,
struct part_record *part_ptr,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes);
static int _valid_features(char *requested, char *available);
/*
* allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
* IN job_ptr - job being allocated resources
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
* last_node_update - last update time of node table
extern void allocate_nodes(struct job_record *job_ptr)
last_node_update = time(NULL);
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i))
make_node_alloc(&node_record_table_ptr[i], job_ptr);
return;
}
* count_cpus - report how many cpus are associated with the identified nodes
* IN bitmap - map of nodes to tally
* RET cpu count
* globals: node_record_count - number of nodes configured
* node_record_table_ptr - pointer to global node table
extern int count_cpus(unsigned *bitmap)
int i, sum;
sum = 0;
for (i = 0; i < node_record_count; i++) {
if (bit_test(bitmap, i) != 1)
continue;
if (slurmctld_conf.fast_schedule)
sum += node_record_table_ptr[i].config_ptr->cpus;
else
sum += node_record_table_ptr[i].cpus;
return sum;
}
/*
* deallocate_nodes - for a given job, deallocate its nodes and make

Moe Jette
committed
* their state NODE_STATE_COMPLETING
* IN job_ptr - pointer to terminating job (already in some COMPLETING state)
* IN timeout - true if job exhausted time limit, send REQUEST_KILL_TIMELIMIT
* RPC instead of REQUEST_TERMINATE_JOB
* IN suspended - true if job was already suspended (node's job_run_cnt
* already decremented);
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
*/
extern void deallocate_nodes(struct job_record *job_ptr, bool timeout,
bool suspended)
int i;
kill_job_msg_t *kill_job = NULL;
agent_arg_t *agent_args = NULL;
int down_node_cnt = 0;
uint16_t base_state;
xassert(job_ptr);
xassert(job_ptr->details);
if (select_g_job_fini(job_ptr) != SLURM_SUCCESS)
error("select_g_job_fini(%u): %m", job_ptr->job_id);
agent_args = xmalloc(sizeof(agent_arg_t));

Moe Jette
committed
if (timeout)
agent_args->msg_type = REQUEST_KILL_TIMELIMIT;
else
agent_args->msg_type = REQUEST_TERMINATE_JOB;
kill_job = xmalloc(sizeof(kill_job_msg_t));
last_node_update = time(NULL);
kill_job->job_id = job_ptr->job_id;
kill_job->job_uid = job_ptr->user_id;
kill_job->nodes = xstrdup(job_ptr->nodes);
kill_job->time = time(NULL);
kill_job->select_jobinfo = select_g_copy_jobinfo(
job_ptr->select_jobinfo);
for (i = 0; i < node_record_count; i++) {
struct node_record *node_ptr = &node_record_table_ptr[i];
if (bit_test(job_ptr->node_bitmap, i) == 0)
base_state = node_ptr->node_state & NODE_STATE_BASE;
if (base_state == NODE_STATE_DOWN) {
/* Issue the KILL RPC, but don't verify response */
down_node_cnt++;
bit_clear(job_ptr->node_bitmap, i);
job_ptr->node_cnt--;
make_node_comp(node_ptr, job_ptr, suspended);
#ifdef HAVE_FRONT_END /* Operate only on front-end */
if (agent_args->node_count > 0)
continue;
#endif
hostlist_push(agent_args->hostlist, node_ptr->name);
agent_args->node_count++;
if ((agent_args->node_count - down_node_cnt) == 0)
job_ptr->job_state &= (~JOB_COMPLETING);
if (agent_args->node_count == 0) {
error("Job %u allocated no nodes to be killed on",
job_ptr->job_id);
xfree(kill_job->nodes);
select_g_free_jobinfo(&kill_job->select_jobinfo);
xfree(kill_job);
xfree(agent_args);
return;
agent_queue_request(agent_args);
* _match_feature - determine if the desired feature is one of those available
* IN seek - desired feature
* IN available - comma separated list of available features
* RET 1 if found, 0 otherwise
static int _match_feature(char *seek, char *available)
char *tmp_available, *str_ptr3, *str_ptr4;
int found;
if (seek == NULL)
return 1; /* nothing to look for */
if (available == NULL)
return SLURM_SUCCESS; /* nothing to find */
tmp_available = xstrdup(available);
found = 0;
str_ptr3 = (char *) strtok_r(tmp_available, ",", &str_ptr4);
while (str_ptr3) {
if (strcmp(seek, str_ptr3) == 0) { /* we have a match */
found = 1;
break;
}
str_ptr3 = (char *) strtok_r(NULL, ",", &str_ptr4);
xfree(tmp_available);
return found;
}
/*
* _pick_best_load - Given a specification of scheduling requirements,
* identify the nodes which "best" satisfy the request.
* "best" is defined as the least loaded nodes
* IN job_ptr - pointer to job being scheduled
* IN/OUT bitmap - usable nodes are set on input, nodes not required to
* satisfy the request are cleared, other left set
* IN min_nodes - minimum count of nodes
* IN max_nodes - maximum count of nodes (0==don't care)
* IN req_nodes - requested (or desired) count of nodes
* RET zero on success, EINVAL otherwise
* globals: node_record_count - count of nodes configured
* node_record_table_ptr - pointer to global node table
* NOTE: bitmap must be a superset of req_nodes at the time that
* _pick_best_load is called
*/
static int
_pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, bool test_only)
{
bitstr_t *no_load_bit, *light_load_bit, *heavy_load_bit;
int error_code;
_node_load_bitmaps(bitmap, &no_load_bit, &light_load_bit,
&heavy_load_bit);
/* first try to use idle nodes */
bit_and(bitmap, no_load_bit);
FREE_NULL_BITMAP(no_load_bit);
/* always include required nodes or selection algorithm fails,
* note that we have already confirmed these nodes are available
* to this job */
if (job_ptr->details && job_ptr->details->req_node_bitmap)
bit_or(bitmap, job_ptr->details->req_node_bitmap);
error_code = select_g_job_test(job_ptr, bitmap,
req_nodes, test_only);
/* now try to use idle and lightly loaded nodes */
if (error_code) {
bit_or(bitmap, light_load_bit);
error_code = select_g_job_test(job_ptr, bitmap,
req_nodes, test_only);
}
FREE_NULL_BITMAP(light_load_bit);
/* now try to use all possible nodes */
if (error_code) {
bit_or(bitmap, heavy_load_bit);
error_code = select_g_job_test(job_ptr, bitmap,
req_nodes, test_only);
}
FREE_NULL_BITMAP(heavy_load_bit);
return error_code;
}
/*
* _node_load_bitmaps - given a bitmap of nodes, create three new bitmaps
* indicative of the load on those nodes
* IN bitmap - map of nodes to test
* OUT no_load_bitmap - nodes from bitmap with no jobs
* OUT light_load_bitmap - nodes from bitmap with one job
* OUT heavy_load_bitmap - nodes from bitmap with two or more jobs
* NOTE: caller must free the created bitmaps
*/
static void
_node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit,
bitstr_t ** light_load_bit, bitstr_t ** heavy_load_bit)
{
int i, load;
bitoff_t size = bit_size(bitmap);
bitstr_t *bitmap0 = bit_alloc(size);
bitstr_t *bitmap1 = bit_alloc(size);
bitstr_t *bitmap2 = bit_alloc(size);
Loading
Loading full blame...