Skip to content
Snippets Groups Projects
node_scheduler.c 49.5 KiB
Newer Older
/*****************************************************************************\
 *  node_scheduler.c - select and allocated nodes to jobs 
 *	Note: there is a global node table (node_record_table_ptr) 
 *****************************************************************************
 *  Copyright (C) 2002-2006 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  UCRL-CODE-217948.
 *  
 *  This file is part of SLURM, a resource management program.
 *  For details, see <http://www.llnl.gov/linux/slurm/>.
 *  
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission 
 *  to link the code of portions of this program with the OpenSSL library under 
 *  certain conditions as described in each individual source file, and 
 *  distribute linked combinations including the two. You must obey the GNU 
 *  General Public License in all respects for all of the code used other than 
 *  OpenSSL. If you modify file(s) with this exception, you may extend this 
 *  exception to your version of the file(s), but you are not obligated to do 
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in 
 *  the program, then also delete it here.
 *
 *  In addition, as a special exception, the copyright holders give permission 
 *  to link the code of portions of this program with the OpenSSL library under 
 *  certain conditions as described in each individual source file, and 
 *  distribute linked combinations including the two. You must obey the GNU 
 *  General Public License in all respects for all of the code used other than 
 *  OpenSSL. If you modify file(s) with this exception, you may extend this 
 *  exception to your version of the file(s), but you are not obligated to do 
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in 
 *  the program, then also delete it here.
 *  
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *  
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <slurm/slurm_errno.h>

#include "src/common/hostlist.h"
#include "src/common/node_select.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/slurmctld.h"
struct node_set {		/* set of nodes with same configuration */
	uint32_t cpus_per_node;	/* NOTE: This is the minimum count,
				 * if FastSchedule==0 then individual 
				 * nodes within the same configuration 
				 * line (in slurm.conf) can actually 
				 * have different CPU counts */
	uint32_t nodes;
	uint32_t weight;
Moe Jette's avatar
Moe Jette committed
};

static int _add_node_set_info(struct node_set *node_set_ptr, 
			      bitstr_t ** node_bitmap, 
			      int *node_cnt, int *cpu_cnt, int cr_enabled);
static int  _build_node_list(struct job_record *job_ptr, 
			     struct node_set **node_set_pptr,
			     int *node_set_size);
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
				 struct job_details *detail_ptr);
static int _match_feature(char *seek, char *available);
static int _nodes_in_sets(bitstr_t *req_bitmap, 
			  struct node_set * node_set_ptr, 
			  int node_set_size);
static void _node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit, 
			       bitstr_t ** light_load_bit, 
			       bitstr_t ** heavy_load_bit);
static int _pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap, 
			uint32_t min_nodes, uint32_t max_nodes, 
			uint32_t req_nodes, bool test_only);
static int _pick_best_nodes(struct node_set *node_set_ptr,
			    int node_set_size, bitstr_t ** select_bitmap,
			    struct job_record *job_ptr,
			    struct part_record *part_ptr,
			    uint32_t min_nodes, uint32_t max_nodes,
			    uint32_t req_nodes);
static int _valid_features(char *requested, char *available);
/*
 * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED
 * IN job_ptr - job being allocated resources
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
 *	last_node_update - last update time of node table
extern void allocate_nodes(struct job_record *job_ptr)
	last_node_update = time(NULL);
	for (i = 0; i < node_record_count; i++) {
		if (bit_test(job_ptr->node_bitmap, i))
			make_node_alloc(&node_record_table_ptr[i], job_ptr);
 * count_cpus - report how many cpus are associated with the identified nodes 
 * IN bitmap - map of nodes to tally
 * RET cpu count
 * globals: node_record_count - number of nodes configured
 *	node_record_table_ptr - pointer to global node table
Moe Jette's avatar
Moe Jette committed
 */
extern int count_cpus(unsigned *bitmap)
Moe Jette's avatar
Moe Jette committed

	for (i = 0; i < node_record_count; i++) {
		if (bit_test(bitmap, i) != 1)
		if (slurmctld_conf.fast_schedule)
			sum += node_record_table_ptr[i].config_ptr->cpus;
		else
			sum += node_record_table_ptr[i].cpus;
/*
 * deallocate_nodes - for a given job, deallocate its nodes and make 
 * IN job_ptr - pointer to terminating job (already in some COMPLETING state)
 * IN timeout - true if job exhausted time limit, send REQUEST_KILL_TIMELIMIT
 *	RPC instead of REQUEST_TERMINATE_JOB
 * IN suspended - true if job was already suspended (node's job_run_cnt 
 *	already decremented);
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
 */
extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, 
	kill_job_msg_t *kill_job = NULL;
	agent_arg_t *agent_args = NULL;
	int down_node_cnt = 0;
	xassert(job_ptr);
	xassert(job_ptr->details);
	if (select_g_job_fini(job_ptr) != SLURM_SUCCESS)
		error("select_g_job_fini(%u): %m", job_ptr->job_id);

	agent_args = xmalloc(sizeof(agent_arg_t));
	if (timeout)
		agent_args->msg_type = REQUEST_KILL_TIMELIMIT;
	else
		agent_args->msg_type = REQUEST_TERMINATE_JOB;
	agent_args->retry = 1;
	agent_args->hostlist = hostlist_create("");
	kill_job = xmalloc(sizeof(kill_job_msg_t));
	last_node_update = time(NULL);
	kill_job->job_id  = job_ptr->job_id;
	kill_job->job_uid = job_ptr->user_id;
	kill_job->nodes   = xstrdup(job_ptr->nodes);
	kill_job->time    = time(NULL);
	kill_job->select_jobinfo = select_g_copy_jobinfo(
			job_ptr->select_jobinfo);
	for (i = 0; i < node_record_count; i++) {
		struct node_record *node_ptr = &node_record_table_ptr[i];
		if (bit_test(job_ptr->node_bitmap, i) == 0)
		base_state = node_ptr->node_state & NODE_STATE_BASE;
		if (base_state == NODE_STATE_DOWN) {
			/* Issue the KILL RPC, but don't verify response */
			down_node_cnt++;
			bit_clear(job_ptr->node_bitmap, i);
		make_node_comp(node_ptr, job_ptr, suspended);
#ifdef HAVE_FRONT_END		/* Operate only on front-end */
		if (agent_args->node_count > 0)
			continue;
#endif
		hostlist_push(agent_args->hostlist, node_ptr->name);
	if ((agent_args->node_count - down_node_cnt) == 0)
		job_ptr->job_state &= (~JOB_COMPLETING);
	if (agent_args->node_count == 0) {
		error("Job %u allocated no nodes to be killed on",
		xfree(kill_job->nodes);
		select_g_free_jobinfo(&kill_job->select_jobinfo);
	agent_args->msg_args = kill_job;
 * _match_feature - determine if the desired feature is one of those available
 * IN seek - desired feature
Moe Jette's avatar
Moe Jette committed
 * IN available - comma separated list of available features
 * RET 1 if found, 0 otherwise
static int _match_feature(char *seek, char *available)
	char *tmp_available, *str_ptr3, *str_ptr4;
	int found;
	if (seek == NULL)
		return 1;	/* nothing to look for */
	if (available == NULL)
		return SLURM_SUCCESS;	/* nothing to find */
	tmp_available = xstrdup(available);
	str_ptr3 = (char *) strtok_r(tmp_available, ",", &str_ptr4);
		if (strcmp(seek, str_ptr3) == 0) {	/* we have a match */
		}
		str_ptr3 = (char *) strtok_r(NULL, ",", &str_ptr4);
/*
 * _pick_best_load - Given a specification of scheduling requirements, 
 *	identify the nodes which "best" satisfy the request.
 * 	"best" is defined as the least loaded nodes
 * IN job_ptr - pointer to job being scheduled
 * IN/OUT bitmap - usable nodes are set on input, nodes not required to 
 *	satisfy the request are cleared, other left set
 * IN min_nodes - minimum count of nodes
 * IN max_nodes - maximum count of nodes (0==don't care)
 * IN req_nodes - requested (or desired) count of nodes
 * RET zero on success, EINVAL otherwise
 * globals: node_record_count - count of nodes configured
 *	node_record_table_ptr - pointer to global node table
 * NOTE: bitmap must be a superset of req_nodes at the time that 
 *	_pick_best_load is called
 */
static int
_pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap, 
		uint32_t min_nodes, uint32_t max_nodes, 
		uint32_t req_nodes, bool test_only)
{
	bitstr_t *no_load_bit, *light_load_bit, *heavy_load_bit;
	int error_code;
	_node_load_bitmaps(bitmap, &no_load_bit, &light_load_bit, 
			&heavy_load_bit);
	/* first try to use idle nodes */
	bit_and(bitmap, no_load_bit);
	FREE_NULL_BITMAP(no_load_bit);
	/* always include required nodes or selection algorithm fails,
	 * note that we have already confirmed these nodes are available
	 * to this job */
	if (job_ptr->details && job_ptr->details->req_node_bitmap)
		bit_or(bitmap, job_ptr->details->req_node_bitmap);
	error_code = select_g_job_test(job_ptr, bitmap, 
				       min_nodes, max_nodes, 
				       req_nodes, test_only);

	/* now try to use idle and lightly loaded nodes */
	if (error_code) {
		bit_or(bitmap, light_load_bit);
		error_code = select_g_job_test(job_ptr, bitmap, 
					       min_nodes, max_nodes, 
					       req_nodes, test_only);
	} 
	FREE_NULL_BITMAP(light_load_bit);

	/* now try to use all possible nodes */
	if (error_code) {
		bit_or(bitmap, heavy_load_bit);
		error_code = select_g_job_test(job_ptr, bitmap, 
					       min_nodes, max_nodes, 
					       req_nodes, test_only);
	}
	FREE_NULL_BITMAP(heavy_load_bit);

	return error_code;
}

/* 
 * _node_load_bitmaps - given a bitmap of nodes, create three new bitmaps
 *	indicative of the load on those nodes
 * IN bitmap             - map of nodes to test
 * OUT no_load_bitmap    - nodes from bitmap with no jobs
 * OUT light_load_bitmap - nodes from bitmap with one job
 * OUT heavy_load_bitmap - nodes from bitmap with two or more jobs
 * NOTE: caller must free the created bitmaps
 */
static void
_node_load_bitmaps(bitstr_t * bitmap, bitstr_t ** no_load_bit, 
		bitstr_t ** light_load_bit, bitstr_t ** heavy_load_bit)
{
	int i, load;
	bitoff_t size = bit_size(bitmap);
	bitstr_t *bitmap0 = bit_alloc(size);
	bitstr_t *bitmap1 = bit_alloc(size);
	bitstr_t *bitmap2 = bit_alloc(size);

Loading
Loading full blame...