/*****************************************************************************\
 *  backfill.c - simple backfill scheduler plugin. 
 *
 *  If a partition does not have root-only access and its nodes are not
 *  shared, then raise the priority of pending jobs if doing so does not
 *  adversely affect the expected initiation of any higher priority job.
 *  We do not alter a job's required or excluded node list, so this is a
 *  conservative algorithm.
 *
 *  For example, consider a cluster "lx[01-08]" with one job executing on 
 *  nodes "lx[01-04]". The highest priority pending job requires five nodes 
 *  including "lx05". The next highest priority pending job requires any 
 *  three nodes. Without explicitly forcing the second job to use nodes 
 *  "lx[06-08]", we can't start it without possibly delaying the higher 
 *  priority job.
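 *
 *  The interval between backfill attempts may be tuned with the
 *  SchedulerParameters option in slurm.conf, for example
 *  "SchedulerParameters=interval=30" (in seconds); see the "interval="
 *  parsing in backfill_agent() below.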
 *****************************************************************************
 *  Copyright (C) 2003-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  CODE-OCEC-09-009. All rights reserved.
 *  
 *  This file is part of SLURM, a resource management program.
 *  For details, see <https://computing.llnl.gov/linux/slurm/>.
 *  Please also read the included file: DISCLAIMER.
 *  
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission 
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and 
 *  distribute linked combinations including the two. You must obey the GNU 
 *  General Public License in all respects for all of the code used other than 
 *  OpenSSL. If you modify file(s) with this exception, you may extend this 
 *  exception to your version of the file(s), but you are not obligated to do 
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in 
 *  the program, then also delete it here.
 *  
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *  
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"

#include "src/common/list.h"
#include "src/common/macros.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/srun_comm.h"
#include "backfill.h"

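/* The backfill scheduler models expected future node availability as a
 * linked list of time slices embedded in a fixed-size array. Each record
 * identifies the nodes expected to be free between begin_time and
 * end_time. Records are chained in time order through "next", starting
 * at element zero; a next value of zero terminates the list. */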
typedef struct node_space_map {
	time_t begin_time;
	time_t end_time;
	bitstr_t *avail_bitmap;
	int next;	/* next record, by time, zero termination */
} node_space_map_t;
int backfilled_jobs = 0;	/* jobs started by this plugin since startup */

/*********************** local variables *********************/
static bool new_work      = false;
static bool stop_backfill = false;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;

#ifndef BACKFILL_INTERVAL
#  ifdef HAVE_BG
#    define BACKFILL_INTERVAL	5
#  else
#    define BACKFILL_INTERVAL	10
#  endif
#endif

/* Set __DEBUG to get detailed logging for this thread without 
 * detailed logging for the entire slurmctld daemon */
#define __DEBUG			0

/* Do not attempt to build job/resource/time record for
 * more than MAX_BACKFILL_JOB_CNT records */
#define MAX_BACKFILL_JOB_CNT	100

/* Do not build job/resource/time records extending more than this
 * far into the future, in seconds (currently one day) */
#define BACKFILL_WINDOW		(24 * 60 * 60)

/*********************** local functions *********************/
static void _add_reservation(uint32_t start_time, uint32_t end_reserve, 
			     bitstr_t *res_bitmap, 
			     node_space_map_t *node_space, 
			     int *node_space_recs);
static void _attempt_backfill(void);
static void _diff_tv_str(struct timeval *tv1, struct timeval *tv2,
		char *tv_str, int len_tv_str);
static bool _more_work(void);
static int  _num_feature_count(struct job_record *job_ptr);
static int  _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap);
static int  _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap,
		       uint32_t min_nodes, uint32_t max_nodes,
		       uint32_t req_nodes);

#if __DEBUG
/* Log resource allocate table */
static void _dump_node_space_table(node_space_map_t *node_space_ptr)
{
	int i = 0;
	char begin_buf[32], end_buf[32], *node_list;

	info("=========================================");
	while (1) {
		slurm_make_time_str(&node_space_ptr[i].begin_time,
				    begin_buf, sizeof(begin_buf));
		slurm_make_time_str(&node_space_ptr[i].end_time,
				    end_buf, sizeof(end_buf));
		node_list = bitmap2node_name(node_space_ptr[i].avail_bitmap);
		info("Begin:%s End:%s Nodes:%s", begin_buf, end_buf, node_list);
		xfree(node_list);
		if ((i = node_space_ptr[i].next) == 0)
			break;
	}
	info("=========================================");
}
#endif

/*
 * _diff_tv_str - build a string showing the time difference between two times
 * IN tv1 - start of event
 * IN tv2 - end of event
 * OUT tv_str - place to put delta time in format "usec=%ld"
 * IN len_tv_str - size of tv_str in bytes
 */
static void _diff_tv_str(struct timeval *tv1, struct timeval *tv2,
		char *tv_str, int len_tv_str)
{
	long delta_t;
	delta_t  = (tv2->tv_sec  - tv1->tv_sec) * 1000000;
	delta_t +=  tv2->tv_usec - tv1->tv_usec;
	snprintf(tv_str, len_tv_str, "usec=%ld", delta_t);
}

/* test if job has feature count specification */
static int _num_feature_count(struct job_record *job_ptr)
{
	struct job_details *detail_ptr = job_ptr->details;
	int rc = 0;
	ListIterator feat_iter;
	struct feature_record *feat_ptr;

	if (detail_ptr->feature_list == NULL)	/* no constraints */
		return rc;

	feat_iter = list_iterator_create(detail_ptr->feature_list);
	while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) {
		if (feat_ptr->count)
			rc++;
	}
	list_iterator_destroy(feat_iter);

	return rc;
}

/* Attempt to schedule a specific job on specific available nodes
 * IN job_ptr - job to schedule
 * IN/OUT avail_bitmap - nodes available/selected to use
 * IN min_nodes/max_nodes/req_nodes - minimum, maximum, and requested
 *	node counts for the job
 * RET SLURM_SUCCESS on success, otherwise an error code
 */
static int  _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap,
		       uint32_t min_nodes, uint32_t max_nodes,
		       uint32_t req_nodes)
{
	bitstr_t *tmp_bitmap;
	int rc = SLURM_SUCCESS;
	int feat_cnt = _num_feature_count(job_ptr);

	if (feat_cnt) {
		/* Ideally schedule the job feature by feature,
		 * but I don't want to add that complexity here
		 * right now, so clear the feature counts and try
		 * to schedule. This will work if there is only 
		 * one feature count. It should work fairly well
		 * in cases where there are multiple feature
		 * counts. */
		struct job_details *detail_ptr = job_ptr->details;
		ListIterator feat_iter;
		struct feature_record *feat_ptr;
		int i = 0, list_size;
		uint16_t *feat_cnt_orig = NULL, high_cnt = 0;

		/* Clear the feature counts */
		list_size = list_count(detail_ptr->feature_list);
		feat_cnt_orig = xmalloc(sizeof(uint16_t) * list_size);
		feat_iter = list_iterator_create(detail_ptr->feature_list);
		while ((feat_ptr = 
			(struct feature_record *) list_next(feat_iter))) {
			high_cnt = MAX(high_cnt, feat_ptr->count);
			feat_cnt_orig[i++] = feat_ptr->count;
			feat_ptr->count = 0;
		}
		list_iterator_destroy(feat_iter);

		if ((job_req_node_filter(job_ptr, *avail_bitmap) != 
		     SLURM_SUCCESS) ||
		    (bit_set_count(*avail_bitmap) < high_cnt)) {
			rc = ESLURM_NODES_BUSY;
		} else {
			rc = select_g_job_test(job_ptr, *avail_bitmap, 
					       high_cnt, max_nodes, req_nodes,
					       SELECT_MODE_WILL_RUN);
		}

		/* Restore the feature counts */
		i = 0;
		feat_iter = list_iterator_create(detail_ptr->feature_list);
		while ((feat_ptr = 
			(struct feature_record *) list_next(feat_iter))) {
			feat_ptr->count = feat_cnt_orig[i++];
		}
		list_iterator_destroy(feat_iter);
		xfree(feat_cnt_orig);
	} else {
		/* Try to schedule the job. First on dedicated nodes
		 * then on shared nodes (if so configured). */
		uint16_t orig_shared;
		time_t now = time(NULL);
		orig_shared = job_ptr->details->shared;
		job_ptr->details->shared = 0;
		tmp_bitmap = bit_copy(*avail_bitmap);
		rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes,
				       max_nodes, req_nodes,
				       SELECT_MODE_WILL_RUN);
		job_ptr->details->shared = orig_shared;
		if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) &&
		    (orig_shared != 0)) {
			FREE_NULL_BITMAP(*avail_bitmap);
			*avail_bitmap = tmp_bitmap;
			rc = select_g_job_test(job_ptr, *avail_bitmap, 
					       min_nodes, max_nodes, req_nodes,
					       SELECT_MODE_WILL_RUN);
		} else
			FREE_NULL_BITMAP(tmp_bitmap);
	}

	return rc;

}

/* Terminate backfill_agent */
extern void stop_backfill_agent(void)
{
	stop_backfill = true;
}

/* backfill_agent - detached thread periodically attempts to backfill jobs */
extern void *backfill_agent(void *args)
{
	struct timeval tv1, tv2;
	char tv_str[20], *sched_params, *tmp_ptr;
	time_t now;
	int backfill_interval = 0, i, iter;
	static time_t last_backfill_time = 0;
	/* Read config and partitions; write jobs and nodes */
	slurmctld_lock_t all_locks = {
		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };

	sched_params = slurm_get_sched_params();
	if (sched_params && (tmp_ptr = strstr(sched_params, "interval=")))
		backfill_interval = atoi(tmp_ptr + 9);
	else
		backfill_interval = BACKFILL_INTERVAL;
	xfree(sched_params);
	if (backfill_interval < 1) {
		fatal("Invalid backfill scheduler interval: %d", 
		      backfill_interval);
	}

	while (!stop_backfill) {
		iter = (BACKFILL_CHECK_SEC * 1000000) /
		       STOP_CHECK_USEC;
		for (i=0; ((i<iter) && (!stop_backfill)); i++) {
			/* test stop_backfill every 0.1 sec for
			 * 2.0 secs to avoid running continuously */
			usleep(STOP_CHECK_USEC);
		}
		
		now = time(NULL);
		/* Avoid resource fragmentation if important */
		if (job_is_completing())
			continue;
		if ((difftime(now, last_backfill_time) < backfill_interval) ||
		    stop_backfill || (!_more_work()))
			continue;
		last_backfill_time = now;

		gettimeofday(&tv1, NULL);
		lock_slurmctld(all_locks);
		_attempt_backfill();
		unlock_slurmctld(all_locks);
		gettimeofday(&tv2, NULL);
		_diff_tv_str(&tv1, &tv2, tv_str, sizeof(tv_str));
#if __DEBUG
		info("backfill: completed, %s", tv_str);
#endif
	}
	return NULL;
}

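/* Scan the priority-ordered job queue, starting jobs that can begin
 * immediately and adding node-space reservations for those that cannot,
 * so that no backfilled job delays the expected start of any higher
 * priority job */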
static void _attempt_backfill(void)
{
	bool filter_root = false;
	struct job_queue *job_queue = NULL;
	int i, j, job_queue_size, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	uint32_t end_time, end_reserve, time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	time_t now = time(NULL), start_res;
	node_space_map_t node_space[MAX_BACKFILL_JOB_CNT + 2];

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue_size = build_job_queue(&job_queue);
	if (job_queue_size == 0)
		return;

	sort_job_queue(job_queue, job_queue_size);

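	/* Start with a single node-space record spanning the entire
	 * backfill window and holding all currently available nodes;
	 * _add_reservation() splits this record as jobs are planned */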
	node_space[0].begin_time = now;
	node_space[0].end_time = now + BACKFILL_WINDOW;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
#if __DEBUG
	_dump_node_space_table(node_space);
#endif

	for (i = 0; i < job_queue_size; i++) {
		job_ptr = job_queue[i].job_ptr;
		part_ptr = job_ptr->part_ptr;
#if __DEBUG
		info("backfill test for job %u", job_ptr->job_id);
#endif

		if (part_ptr == NULL) {
			part_ptr = find_part_record(job_ptr->partition);
			xassert(part_ptr);
			job_ptr->part_ptr = part_ptr;
			error("partition pointer reset for job %u, part %s",
			      job_ptr->job_id, job_ptr->partition);
		}
		if ((part_ptr->state_up == 0) ||
		    (part_ptr->node_bitmap == NULL))
			continue;
		if ((part_ptr->root_only) && filter_root)
			continue;

		if ((!job_independent(job_ptr)) ||
		    (license_job_test(job_ptr) != SLURM_SUCCESS))
			continue;

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			/* job's min_nodes exceeds partition's max_nodes */
			continue;
		}

		/* Determine job's expected completion time */
		if (job_ptr->time_limit == NO_VAL) {
			if (part_ptr->max_time == INFINITE)
				time_limit = 365 * 24 * 60; /* one year */
			else
				time_limit = part_ptr->max_time;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_ptr->max_time);
		}
		/* Determine impact of any resource reservations */
		FREE_NULL_BITMAP(avail_bitmap);
		start_res = now;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap);
		if (j != SLURM_SUCCESS)
			continue;
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;

		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		for (j=0; ; ) {
			if (node_space[j].end_time < start_res) {
				/* record ends before the job can start;
				 * just advance to the next record (a
				 * "continue" would loop forever since j
				 * is only advanced below) */
			} else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		if (job_ptr->details->exc_node_bitmap) {
			/* Remove excluded nodes from avail_bitmap without
			 * allocating a temporary copy: invert, AND, invert */
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}
		if ((job_ptr->details->req_node_bitmap) &&
		    (!bit_super_set(job_ptr->details->req_node_bitmap,
				    avail_bitmap)))
			continue;	/* required nodes missing */
		if (bit_set_count(avail_bitmap) < min_nodes)
			continue;	/* insufficient nodes remain */
		if (job_req_node_filter(job_ptr, avail_bitmap))
			continue;	/* nodes lack features */

		j = _try_sched(job_ptr, &avail_bitmap, 
			       min_nodes, max_nodes, req_nodes);
		if (j != SLURM_SUCCESS)
			continue;	/* not runnable */
		
		job_ptr->start_time = MAX(job_ptr->start_time, start_res);
		if (job_ptr->start_time <= now) {
			int rc = _start_job(job_ptr, resv_bitmap);
			if (rc == ESLURM_ACCOUNTING_POLICY) 
				continue;
			else if (rc != SLURM_SUCCESS)
				/* Planned to start job, but something bad
				 * happened. Reserve nodes where this job
				 * should apparently run and try more jobs. */
				continue;
		}
		if (job_ptr->start_time > (now + BACKFILL_WINDOW)) {
			/* Starts too far in the future to worry about */
			continue;
		}

		if (node_space_recs == MAX_BACKFILL_JOB_CNT) {
			/* Already have too many jobs to deal with */
			break;
		}

		/*
		 * Add reservation to scheduling table
		 */
		end_reserve = job_ptr->start_time + (time_limit * 60);
		bit_not(avail_bitmap);	/* set bits now identify the nodes
					 * NOT used by this job */
		_add_reservation(job_ptr->start_time, end_reserve, 
				 avail_bitmap, node_space, &node_space_recs);
#if __DEBUG
		_dump_node_space_table(node_space);
#endif
	}
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	for (i=0; ; ) {
		bit_free(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(job_queue);
}

/* Try to start the job on any non-reserved nodes */
static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap)
{
	int rc;
	bitstr_t *orig_exc_nodes = NULL;
	static uint32_t fail_jobid = 0;

	if (job_ptr->details->exc_node_bitmap) {
		orig_exc_nodes = job_ptr->details->exc_node_bitmap;
		bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap);
	} else
		job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap);

	rc = select_nodes(job_ptr, false, NULL);
	bit_free(job_ptr->details->exc_node_bitmap);
	job_ptr->details->exc_node_bitmap = orig_exc_nodes;
	if (rc == SLURM_SUCCESS) {	
		/* job initiated */
		last_job_update = time(NULL);
		info("backfill: Started JobId=%u on %s",
		     job_ptr->job_id, job_ptr->nodes);
		if (job_ptr->batch_flag)
			launch_job(job_ptr);
		else
			srun_allocate(job_ptr->job_id);
		backfilled_jobs++;
#if __DEBUG
		info("backfill: Jobs backfilled: %d", backfilled_jobs);
#endif
	} else if ((job_ptr->job_id != fail_jobid) &&
		   (rc != ESLURM_ACCOUNTING_POLICY)) {
		char *node_list;
		bit_not(resv_bitmap);
		node_list = bitmap2node_name(resv_bitmap);
		/* This happens when a job has sharing disabled and
		 * a selected node is still completing some job, 
		 * which should be a temporary situation. */
		verbose("backfill: Failed to start JobId=%u in %s: %s",
			job_ptr->job_id, node_list, slurm_strerror(rc));
		xfree(node_list);
		fail_jobid = job_ptr->job_id;
	} else {
		debug3("backfill: Failed to start JobId=%u: %s",
		       job_ptr->job_id, slurm_strerror(rc));
	}
	return rc;
}

/* trigger the attempt of a backfill */
extern void run_backfill (void)
{
	pthread_mutex_lock( &thread_flag_mutex );
	new_work = true;
	pthread_mutex_unlock( &thread_flag_mutex );
}

/* Report if any changes occurred to job, node or partition information */
static bool _more_work (void)
{
	bool rc;
	static time_t backfill_job_time  = (time_t) 0;
	static time_t backfill_node_time = (time_t) 0;
	static time_t backfill_part_time = (time_t) 0;

	pthread_mutex_lock( &thread_flag_mutex );
	if ( (backfill_job_time  == last_job_update ) &&
	     (backfill_node_time == last_node_update) &&
	     (backfill_part_time == last_part_update) &&
	     (new_work == false) ) {
		rc = false;
	} else {
		backfill_job_time  = last_job_update;
		backfill_node_time = last_node_update;
		backfill_part_time = last_part_update;
		new_work = false;
		rc = true;
	}
	pthread_mutex_unlock( &thread_flag_mutex );
	return rc;
}

/* Create a reservation for a job in the future */
static void _add_reservation(uint32_t start_time, uint32_t end_reserve, 
			     bitstr_t *res_bitmap, 
			     node_space_map_t *node_space, 
			     int *node_space_recs)
{
	int i, j;

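	/* If start_time falls inside an existing record, split that record
	 * in two so that a record boundary lies exactly at start_time */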
	for (j=0; ; ) {
		if (node_space[j].end_time > start_time) {
			/* insert start entry record */
			i = *node_space_recs;
			node_space[i].begin_time = start_time;
			node_space[i].end_time = node_space[j].end_time;
			node_space[j].end_time = start_time;
			node_space[i].avail_bitmap = 
				bit_copy(node_space[j].avail_bitmap);
			node_space[i].next = node_space[j].next;
			node_space[j].next = i;
			(*node_space_recs)++;
			break;
		}
		if (node_space[j].end_time == start_time) {
			/* no need to insert start entry record */
			break;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}

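	/* Clear the reserved nodes from every record beginning at or after
	 * start_time. Note end_reserve is not consulted here: the nodes
	 * remain reserved through the end of the backfill window, which is
	 * a conservative simplification. */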
	for (j=0; ; ) {
		if (node_space[j].begin_time >= start_time)
			bit_and(node_space[j].avail_bitmap, res_bitmap);
		if ((j = node_space[j].next) == 0)
			break;
	}
}