Handle uninitialized values (commit c652acd1)
Brian Christiansen authored
As reported when compiling with optimizations (-O2). Initialize the
variables early since they were being initialized in their loops and
later checked for -1.

Technically this could not have happened since, for example,
user_part_inx1 would only be set to -1 if max_backfill_job_per_user_part
was set, and user_part_inx1 is only checked later if
max_backfill_job_per_user_part is set.

The same applies to part_inx with max_backfill_job_per_part, and to
user_inx with max_backfill_job_per_user.
backfill.c
/*****************************************************************************\
 *  backfill.c - simple backfill scheduler plugin.
 *
 *  If a partition does not have root only access and nodes are not shared
 *  then raise the priority of pending jobs if doing so does not adversely
 *  affect the expected initiation of any higher priority job. We do not alter
 *  a job's required or excluded node list, so this is a conservative
 *  algorithm.
 *
 *  For example, consider a cluster "lx[01-08]" with one job executing on
 *  nodes "lx[01-04]". The highest priority pending job requires five nodes
 *  including "lx05". The next highest priority pending job requires any
 *  three nodes. Without explicitly forcing the second job to use nodes
 *  "lx[06-08]", we can't start it without possibly delaying the higher
 *  priority job.
 *****************************************************************************
 *  Copyright (C) 2003-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of SLURM, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include "config.h"

#if HAVE_SYS_PRCTL_H
#  include <sys/prctl.h>
#endif

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include "slurm/slurm.h"
#include "slurm/slurmdb.h"
#include "slurm/slurm_errno.h"

#include "src/common/assoc_mgr.h"
#include "src/common/gres.h"
#include "src/common/list.h"
#include "src/common/macros.h"
#include "src/common/node_features.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/common/power.h"
#include "src/common/read_config.h"
#include "src/common/slurm_accounting_storage.h"
#include "src/common/slurm_mcs.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"

#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/burst_buffer.h"
#include "src/slurmctld/fed_mgr.h"
#include "src/slurmctld/front_end.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/preempt.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/srun_comm.h"
#include "backfill.h"

#define BACKFILL_INTERVAL	30
#define BACKFILL_RESOLUTION	60
#define BACKFILL_WINDOW		(24 * 60 * 60)
#define BF_MAX_USERS		5000
#define BF_MAX_JOB_ARRAY_RESV	20

#define SLURMCTLD_THREAD_LIMIT	5
#define SCHED_TIMEOUT		2000000	/* time in micro-seconds */
#define YIELD_SLEEP		500000	/* time in micro-seconds */

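/*
 * Expected node availability over time is tracked as a "node space map":
 * an array of records chained by the "next" index and ordered by time,
 * with index zero serving as both the list head and the terminator. Each
 * record holds the nodes expected to be free during its time window.
 * For example (illustrative values only), after reserving nodes
 * "lx[01-04]" for one hour for a pending job, the map might contain:
 *
 *	[0] begin=now	  end=now+1h   avail=lx[05-08]  next=1
 *	[1] begin=now+1h  end=window   avail=lx[01-08]  next=0
 */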
typedef struct node_space_map {
	time_t begin_time;
	time_t end_time;
	bitstr_t *avail_bitmap;
	int next;	/* next record, by time, zero termination */
} node_space_map_t;

/*
 * Pack job scheduling structures
 * NOTE: An individual pack job component can be submitted to multiple
 *       partitions and have different start times in each
 */
typedef struct pack_job_rec {
	uint32_t job_id;
	struct job_record *job_ptr;
	time_t latest_start;		/* Time when expected to start */
	struct part_record *part_ptr;
} pack_job_rec_t;

typedef struct pack_job_map {
	uint32_t comp_time_limit;	/* Time limit for pack job */
	time_t prev_start;		/* Time when expected to start from last test */
	uint32_t pack_job_id;
	List pack_job_list;		/* List of pack_job_rec_t */
} pack_job_map_t;

typedef struct user_part_rec {
	uint16_t *njobs;		/* Count of jobs tested, per user */
	struct part_record *part_ptr;	/* Pointer to partition record */
	uint32_t *uid;			/* Array of user IDs */
	int user_cnt;			/* Number of users in uid array */
} user_part_rec_t;

/* Diag statistics */
extern diag_stats_t slurmctld_diag_stats;
uint32_t bf_sleep_usec = 0;

/*********************** local variables *********************/
static bool stop_backfill = false;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  term_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;
static bool config_flag = false;
static uint64_t debug_flags = 0;
static int backfill_interval = BACKFILL_INTERVAL;
static int bf_max_time = BACKFILL_INTERVAL;
static int backfill_resolution = BACKFILL_RESOLUTION;
static int backfill_window = BACKFILL_WINDOW;
static int bf_job_part_count_reserve = 0;
static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
static int bf_min_age_reserve = 0;
static uint32_t bf_min_prio_reserve = 0;
static int max_backfill_job_cnt = 100;
static int max_backfill_job_per_assoc = 0;
static int max_backfill_job_per_part = 0;
static int max_backfill_job_per_user = 0;
static int max_backfill_job_per_user_part = 0;
static int max_backfill_jobs_start = 0;
static bool backfill_continue = false;
static bool assoc_limit_stop = false;
static int defer_rpc_cnt = 0;
static int sched_timeout = SCHED_TIMEOUT;
static int yield_sleep   = YIELD_SLEEP;
static List pack_job_list = NULL;

/*********************** local functions *********************/
static void _add_reservation(uint32_t start_time, uint32_t end_reserve,
			     bitstr_t *res_bitmap,
			     node_space_map_t *node_space,
			     int *node_space_recs);
static int  _attempt_backfill(void);
static int  _clear_job_start_times(void *x, void *arg);
static int  _clear_qos_blocked_times(void *x, void *arg);
static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2);
static uint32_t _get_job_max_tl(struct job_record *job_ptr, time_t now,
				node_space_map_t *node_space);
static bool _job_part_valid(struct job_record *job_ptr,
			    struct part_record *part_ptr);
static void _load_config(void);
static bool _many_pending_rpcs(void);
static bool _more_work(time_t last_backfill_time);
static uint32_t _my_sleep(int usec);
static int  _num_feature_count(struct job_record *job_ptr, bool *has_xor);
static int  _pack_find_map(void *x, void *key);
static void _pack_map_del(void *x);
static void _pack_rec_del(void *x);
static void _pack_start_clear(void);
static time_t _pack_start_find(struct job_record *job_ptr, time_t now);
static void _pack_start_set(struct job_record *job_ptr, time_t latest_start,
			    uint32_t comp_time_limit);
static void _pack_start_test(node_space_map_t *node_space);
static void _reset_job_time_limit(struct job_record *job_ptr, time_t now,
				  node_space_map_t *node_space);
static int  _start_job(struct job_record *job_ptr, bitstr_t *avail_bitmap);
static bool _test_resv_overlap(node_space_map_t *node_space,
			       bitstr_t *use_bitmap, uint32_t start_time,
			       uint32_t end_reserve);
static int  _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap,
		       uint32_t min_nodes, uint32_t max_nodes,
		       uint32_t req_nodes, bitstr_t *exc_core_bitmap);
static int  _yield_locks(int usec);

/* Log resources to be allocated to a pending job */
static void _dump_job_sched(struct job_record *job_ptr, time_t end_time,
			    bitstr_t *avail_bitmap)
{
	char begin_buf[32], end_buf[32], *node_list;

	slurm_make_time_str(&job_ptr->start_time, begin_buf, sizeof(begin_buf));
	slurm_make_time_str(&end_time, end_buf, sizeof(end_buf));
	node_list = bitmap2node_name(avail_bitmap);
	info("Job %u to start at %s, end at %s on %s",
	     job_ptr->job_id, begin_buf, end_buf, node_list);
	xfree(node_list);
}

static void _dump_job_test(struct job_record *job_ptr, bitstr_t *avail_bitmap,
			   time_t start_time)
{
	char begin_buf[32], *node_list;

	if (start_time == 0)
		strcpy(begin_buf, "NOW");
	else
		slurm_make_time_str(&start_time, begin_buf, sizeof(begin_buf));
	node_list = bitmap2node_name(avail_bitmap);
	info("Test job %u at %s on %s", job_ptr->job_id, begin_buf, node_list);
	xfree(node_list);
}

/* Log resource allocate table */
static void _dump_node_space_table(node_space_map_t *node_space_ptr)
{
	int i = 0;
	char begin_buf[32], end_buf[32], *node_list;

	info("=========================================");
	while (1) {
		slurm_make_time_str(&node_space_ptr[i].begin_time,
				    begin_buf, sizeof(begin_buf));
		slurm_make_time_str(&node_space_ptr[i].end_time,
				    end_buf, sizeof(end_buf));
		node_list = bitmap2node_name(node_space_ptr[i].avail_bitmap);
		info("Begin:%s End:%s Nodes:%s",
		     begin_buf, end_buf, node_list);
		xfree(node_list);
		if ((i = node_space_ptr[i].next) == 0)
			break;
	}
	info("=========================================");
}

static void _set_job_time_limit(struct job_record *job_ptr, uint32_t new_limit)
{
	job_ptr->time_limit = new_limit;
	/* reset flag if we have a NO_VAL time_limit */
	if (job_ptr->time_limit == NO_VAL)
		job_ptr->limit_set.time = 0;
}

/*
 * _many_pending_rpcs - Determine if slurmctld is busy with many active RPCs
 * RET - True if defer_rpc_cnt (max_rpc_cnt) is configured and slurmctld
 *	 currently has at least that many active RPCs
 */
static bool _many_pending_rpcs(void)
{
	//info("thread_count = %u", slurmctld_config.server_thread_count);
	if ((defer_rpc_cnt > 0) &&
	    (slurmctld_config.server_thread_count >= defer_rpc_cnt))
		return true;
	return false;
}

/* test if job has feature count specification */
static int _num_feature_count(struct job_record *job_ptr, bool *has_xor)
{
	struct job_details *detail_ptr = job_ptr->details;
	int rc = 0;
	ListIterator feat_iter;
	job_feature_t *feat_ptr;

	if (detail_ptr->feature_list == NULL)	/* no constraints */
		return rc;

	feat_iter = list_iterator_create(detail_ptr->feature_list);
	while ((feat_ptr = (job_feature_t *) list_next(feat_iter))) {
		if (feat_ptr->count)
			rc++;
		if (feat_ptr->op_code == FEATURE_OP_XOR)
			*has_xor = true;
	}
	list_iterator_destroy(feat_iter);

	return rc;
}

static int _clear_qos_blocked_times(void *x, void *arg)
{
	slurmdb_qos_rec_t *qos_ptr = (slurmdb_qos_rec_t *) x;
	qos_ptr->blocked_until = 0;

	return 0;
}

/* Attempt to schedule a specific job on specific available nodes
 * IN job_ptr - job to schedule
 * IN/OUT avail_bitmap - nodes available/selected to use
 * IN exc_core_bitmap - cores which can not be used
 * RET SLURM_SUCCESS on success, otherwise an error code
 */
static int  _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap,
		       uint32_t min_nodes, uint32_t max_nodes,
		       uint32_t req_nodes, bitstr_t *exc_core_bitmap)
{
	bitstr_t *low_bitmap = NULL, *tmp_bitmap = NULL;
	int rc = SLURM_SUCCESS;
	bool has_xor = false;
	int feat_cnt = _num_feature_count(job_ptr, &has_xor);
	struct job_details *detail_ptr = job_ptr->details;
	List preemptee_candidates = NULL;
	List preemptee_job_list = NULL;
	ListIterator feat_iter;
	job_feature_t *feat_ptr;

	if (feat_cnt) {
		/* Ideally schedule the job feature by feature,
		 * but I don't want to add that complexity here
		 * right now, so clear the feature counts and try
		 * to schedule. This will work if there is only
		 * one feature count. It should work fairly well
		 * in cases where there are multiple feature
		 * counts. */
		int i = 0, list_size;
		uint16_t *feat_cnt_orig = NULL, high_cnt = 0;

		/* Clear the feature counts */
		list_size = list_count(detail_ptr->feature_list);
		feat_cnt_orig = xmalloc(sizeof(uint16_t) * list_size);
		feat_iter = list_iterator_create(detail_ptr->feature_list);
		while ((feat_ptr = (job_feature_t *) list_next(feat_iter))) {
			high_cnt = MAX(high_cnt, feat_ptr->count);
			feat_cnt_orig[i++] = feat_ptr->count;
			feat_ptr->count = 0;
		}
		list_iterator_destroy(feat_iter);

		if ((job_req_node_filter(job_ptr, *avail_bitmap, true) !=
		     SLURM_SUCCESS) ||
		    (bit_set_count(*avail_bitmap) < high_cnt)) {
			rc = ESLURM_NODES_BUSY;
		} else {
			preemptee_candidates =
				slurm_find_preemptable_jobs(job_ptr);
			rc = select_g_job_test(job_ptr, *avail_bitmap,
					       high_cnt, max_nodes, req_nodes,
					       SELECT_MODE_WILL_RUN,
					       preemptee_candidates,
					       &preemptee_job_list,
					       exc_core_bitmap);
			FREE_NULL_LIST(preemptee_job_list);
		}

		/* Restore the feature counts */
		i = 0;
		feat_iter = list_iterator_create(detail_ptr->feature_list);
		while ((feat_ptr = (job_feature_t *) list_next(feat_iter))) {
			feat_ptr->count = feat_cnt_orig[i++];
		}
		list_iterator_destroy(feat_iter);
		xfree(feat_cnt_orig);
	} else if (has_xor) {
		/* Cache the feature information and test the individual
		 * features, one at a time */
		job_feature_t feature_base;
		List feature_cache = detail_ptr->feature_list;
		time_t low_start = 0;

		detail_ptr->feature_list = list_create(NULL);
		feature_base.count = 0;
		feature_base.op_code = FEATURE_OP_END;
		list_append(detail_ptr->feature_list, &feature_base);

		tmp_bitmap = bit_copy(*avail_bitmap);
		feat_iter = list_iterator_create(feature_cache);
		while ((feat_ptr = (job_feature_t *) list_next(feat_iter))) {
			feature_base.name = feat_ptr->name;
			if ((job_req_node_filter(job_ptr, *avail_bitmap, true)
			     == SLURM_SUCCESS) &&
			    (bit_set_count(*avail_bitmap) >= min_nodes)) {
				preemptee_candidates =
					slurm_find_preemptable_jobs(job_ptr);
				rc = select_g_job_test(job_ptr, *avail_bitmap,
						       min_nodes, max_nodes,
						       req_nodes,
						       SELECT_MODE_WILL_RUN,
						       preemptee_candidates,
						       &preemptee_job_list,
						       exc_core_bitmap);
				FREE_NULL_LIST(preemptee_job_list);
				if ((rc == SLURM_SUCCESS) &&
				    ((low_start == 0) ||
				     (low_start > job_ptr->start_time))) {
					low_start = job_ptr->start_time;
					low_bitmap = *avail_bitmap;
					*avail_bitmap = NULL;
				}
			}
			FREE_NULL_BITMAP(*avail_bitmap);
			*avail_bitmap = bit_copy(tmp_bitmap);
		}
		list_iterator_destroy(feat_iter);
		FREE_NULL_BITMAP(tmp_bitmap);
		if (low_start) {
			job_ptr->start_time = low_start;
			rc = SLURM_SUCCESS;
			*avail_bitmap = low_bitmap;
		} else {
			rc = ESLURM_NODES_BUSY;
			FREE_NULL_BITMAP(low_bitmap);
		}

		/* Restore the original feature information */
		list_destroy(detail_ptr->feature_list);
		detail_ptr->feature_list = feature_cache;
	} else if (detail_ptr->feature_list) {
		if ((job_req_node_filter(job_ptr, *avail_bitmap, true) !=
		     SLURM_SUCCESS) ||
		    (bit_set_count(*avail_bitmap) < min_nodes)) {
			rc = ESLURM_NODES_BUSY;
		} else {
			preemptee_candidates =
					slurm_find_preemptable_jobs(job_ptr);
			rc = select_g_job_test(job_ptr, *avail_bitmap,
					       min_nodes, max_nodes, req_nodes,
					       SELECT_MODE_WILL_RUN,
					       preemptee_candidates,
					       &preemptee_job_list,
					       exc_core_bitmap);
			FREE_NULL_LIST(preemptee_job_list);
		}
	} else {
		/* Try to schedule the job. First on dedicated nodes
		 * then on shared nodes (if so configured). */
		uint16_t orig_shared;
		time_t now = time(NULL);
		char str[100];

		preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
		orig_shared = job_ptr->details->share_res;
		job_ptr->details->share_res = 0;
		tmp_bitmap = bit_copy(*avail_bitmap);

		if (exc_core_bitmap) {
			bit_fmt(str, (sizeof(str) - 1), exc_core_bitmap);
			debug2("%s exclude core bitmap: %s", __func__, str);
		}

		rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes,
				       max_nodes, req_nodes,
				       SELECT_MODE_WILL_RUN,
				       preemptee_candidates,
				       &preemptee_job_list,
				       exc_core_bitmap);
		FREE_NULL_LIST(preemptee_job_list);

		job_ptr->details->share_res = orig_shared;

		if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) &&
		    (orig_shared != 0)) {
			FREE_NULL_BITMAP(*avail_bitmap);
			*avail_bitmap = tmp_bitmap;
			rc = select_g_job_test(job_ptr, *avail_bitmap,
					       min_nodes, max_nodes, req_nodes,
					       SELECT_MODE_WILL_RUN,
					       preemptee_candidates,
					       &preemptee_job_list,
					       exc_core_bitmap);
			FREE_NULL_LIST(preemptee_job_list);
		} else
			FREE_NULL_BITMAP(tmp_bitmap);
	}

	FREE_NULL_LIST(preemptee_candidates);
	return rc;
}

/* Terminate backfill_agent */
extern void stop_backfill_agent(void)
{
	slurm_mutex_lock(&term_lock);
	stop_backfill = true;
	slurm_cond_signal(&term_cond);
	slurm_mutex_unlock(&term_lock);
}

/* Sleep for at least specified time, returns actual sleep time in usec */
static uint32_t _my_sleep(int usec)
{
	int64_t nsec;
	uint32_t sleep_time = 0;
	struct timespec ts = {0, 0};
	struct timeval  tv1 = {0, 0}, tv2 = {0, 0};

	if (gettimeofday(&tv1, NULL)) {		/* Some error */
		sleep(1);
		return 1000000;
	}

	nsec  = tv1.tv_usec + usec;
	nsec *= 1000;
	ts.tv_sec  = tv1.tv_sec + (nsec / 1000000000);
	ts.tv_nsec = nsec % 1000000000;
	slurm_mutex_lock(&term_lock);
	if (!stop_backfill)
		slurm_cond_timedwait(&term_cond, &term_lock, &ts);
	slurm_mutex_unlock(&term_lock);
	if (gettimeofday(&tv2, NULL))
		return usec;
	sleep_time = (tv2.tv_sec - tv1.tv_sec) * 1000000;
	sleep_time += tv2.tv_usec;
	sleep_time -= tv1.tv_usec;
	return sleep_time;
}

static void _load_config(void)
{
	char *sched_params, *tmp_ptr;

	sched_params = slurm_get_sched_params();
	debug_flags  = slurm_get_debug_flags();

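	/*
	 * Parse SchedulerParameters options of the form "name=value". The
	 * offsets passed to atoi() below are the string lengths of the
	 * option names (e.g. 12 for "bf_interval="), so each atoi() starts
	 * at the character immediately after the '='.
	 */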
	if (sched_params && (tmp_ptr = strstr(sched_params, "bf_interval="))) {
		backfill_interval = atoi(tmp_ptr + 12);
		if (backfill_interval < 1) {
			error("Invalid SchedulerParameters bf_interval: %d",
			      backfill_interval);
			backfill_interval = BACKFILL_INTERVAL;
		}
	} else {
		backfill_interval = BACKFILL_INTERVAL;
	}

	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_max_time="))) {
		bf_max_time = atoi(tmp_ptr + 12);
		if (bf_max_time < 1) {
			error("Invalid SchedulerParameters bf_max_time: %d",
			      bf_max_time);
			bf_max_time = backfill_interval;
		}
	} else {
		bf_max_time = backfill_interval;
	}

	if (sched_params && (tmp_ptr = strstr(sched_params, "bf_window="))) {
		backfill_window = atoi(tmp_ptr + 10) * 60;  /* mins to secs */
		if (backfill_window < 1) {
			error("Invalid SchedulerParameters bf_window: %d",
			      backfill_window);
			backfill_window = BACKFILL_WINDOW;
		}
	} else {
		backfill_window = BACKFILL_WINDOW;
	}

	/* "max_job_bf" replaced by "bf_max_job_test" in version 14.03 and
	 * can be removed later. Only "bf_max_job_test" is documented. */
	if (sched_params && (tmp_ptr=strstr(sched_params, "bf_max_job_test=")))
		max_backfill_job_cnt = atoi(tmp_ptr + 16);
	else if (sched_params && (tmp_ptr=strstr(sched_params, "max_job_bf=")))
		max_backfill_job_cnt = atoi(tmp_ptr + 11);
	else
		max_backfill_job_cnt = 100;
	if (max_backfill_job_cnt < 1) {
		error("Invalid SchedulerParameters bf_max_job_test: %d",
		      max_backfill_job_cnt);
		max_backfill_job_cnt = 100;
	}

	if (sched_params && (tmp_ptr=strstr(sched_params, "bf_resolution="))) {
		backfill_resolution = atoi(tmp_ptr + 14);
		if (backfill_resolution < 1) {
			error("Invalid SchedulerParameters bf_resolution: %d",
			      backfill_resolution);
			backfill_resolution = BACKFILL_RESOLUTION;
		}
	} else {
		backfill_resolution = BACKFILL_RESOLUTION;
	}

	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_max_job_array_resv="))) {
		bf_max_job_array_resv = atoi(tmp_ptr + 22);
		if (bf_max_job_array_resv < 0) {
			error("Invalid SchedulerParameters bf_max_job_array_resv: %d",
			      bf_max_job_array_resv);
			bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
		}
	} else {
		bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
	}

	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_max_job_part="))) {
		max_backfill_job_per_part = atoi(tmp_ptr + 16);
		if (max_backfill_job_per_part < 0) {
			error("Invalid SchedulerParameters bf_max_job_part: %d",
			      max_backfill_job_per_part);
			max_backfill_job_per_part = 0;
		}
	} else {
		max_backfill_job_per_part = 0;
	}
	if ((max_backfill_job_per_part != 0) &&
	    (max_backfill_job_per_part >= max_backfill_job_cnt)) {
		error("bf_max_job_part >= bf_max_job_test (%u >= %u)",
		      max_backfill_job_per_part, max_backfill_job_cnt);
	}
	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_max_job_start="))) {
		max_backfill_jobs_start = atoi(tmp_ptr + 17);
		if (max_backfill_jobs_start < 0) {
			error("Invalid SchedulerParameters bf_max_job_start: %d",
			      max_backfill_jobs_start);
			max_backfill_jobs_start = 0;
		}
	} else {
		max_backfill_jobs_start = 0;
	}

	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_max_job_user="))) {
		max_backfill_job_per_user = atoi(tmp_ptr + 16);
		if (max_backfill_job_per_user < 0) {
			error("Invalid SchedulerParameters bf_max_job_user: %d",
			      max_backfill_job_per_user);
			max_backfill_job_per_user = 0;
		}
	} else {
		max_backfill_job_per_user = 0;
	}
	if ((max_backfill_job_per_user != 0) &&
	    (max_backfill_job_per_user > max_backfill_job_cnt)) {
		info("warning: bf_max_job_user > bf_max_job_test (%u > %u)",
		     max_backfill_job_per_user, max_backfill_job_cnt);
	}

	bf_job_part_count_reserve = 0;
	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_job_part_count_reserve="))) {
		int job_cnt = atoi(tmp_ptr + 26);
		if (job_cnt < 0) {
			error("Invalid SchedulerParameters bf_job_part_count_reserve: %d",
			      job_cnt);
		} else {
			bf_job_part_count_reserve = job_cnt;
		}
	}

	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_max_job_user_part="))) {
		max_backfill_job_per_user_part = atoi(tmp_ptr + 21);
		if (max_backfill_job_per_user_part < 0) {
			error("Invalid SchedulerParameters bf_max_job_user_part: %d",
			      max_backfill_job_per_user_part);
			max_backfill_job_per_user_part = 0;
		}
	} else {
		max_backfill_job_per_user_part = 0;
	}
	if ((max_backfill_job_per_user_part != 0) &&
	    (max_backfill_job_per_user_part > max_backfill_job_cnt)) {
		info("warning: bf_max_job_user_part > bf_max_job_test (%u > %u)",
		     max_backfill_job_per_user_part, max_backfill_job_cnt);
	}


	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_max_job_assoc="))) {
		max_backfill_job_per_assoc = atoi(tmp_ptr + 17);
		if (max_backfill_job_per_assoc < 0) {
			error("Invalid SchedulerParameters bf_max_job_assoc: %d",
			      max_backfill_job_per_assoc);
			max_backfill_job_per_assoc = 0;
		}
	} else {
		max_backfill_job_per_assoc = 0;
	}
	if ((max_backfill_job_per_assoc != 0) &&
	    (max_backfill_job_per_assoc > max_backfill_job_cnt)) {
		info("warning: bf_max_job_assoc > bf_max_job_test (%u > %u)",
		     max_backfill_job_per_assoc, max_backfill_job_cnt);
	}
	if ((max_backfill_job_per_assoc != 0) &&
	    (max_backfill_job_per_user != 0)) {
		error("Both bf_max_job_user and bf_max_job_assoc are set: "
		      "bf_max_job_assoc taking precedence.");
		max_backfill_job_per_user = 0;
	}
	if ((max_backfill_job_per_assoc != 0) &&
	    (BF_MAX_USERS < g_user_assoc_count)) {
		info("warning: BF_MAX_USERS < g_user_assoc_count(%u), consider increasing",
		     g_user_assoc_count);
	}
	bf_min_age_reserve = 0;
	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) {
		int min_age = atoi(tmp_ptr + 19);
		if (min_age < 0) {
			error("Invalid SchedulerParameters bf_min_age_reserve: %d",
			      min_age);
		} else {
			bf_min_age_reserve = min_age;
		}
	}

	bf_min_prio_reserve = 0;
	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_min_prio_reserve="))) {
		int64_t min_prio = (int64_t) atoll(tmp_ptr + 20);
		if (min_prio < 0) {
			error("Invalid SchedulerParameters bf_min_prio_reserve: %"PRIi64,
			      min_prio);
		} else {
			bf_min_prio_reserve = (uint32_t) min_prio;
		}
	}

	/* bf_continue makes backfill continue where it was if interrupted */
	if (sched_params && (strstr(sched_params, "bf_continue"))) {
		backfill_continue = true;
	} else {
		backfill_continue = false;
	}

	if (sched_params && (strstr(sched_params, "assoc_limit_stop"))) {
		assoc_limit_stop = true;
	} else {
		assoc_limit_stop = false;
	}


	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_yield_interval="))) {
		sched_timeout = atoi(tmp_ptr + 18);
		if (sched_timeout <= 0) {
			error("Invalid backfill scheduler bf_yield_interval: %d",
			      sched_timeout);
			sched_timeout = SCHED_TIMEOUT;
		}
	} else {
		sched_timeout = SCHED_TIMEOUT;
	}

	if (sched_params &&
	    (tmp_ptr = strstr(sched_params, "bf_yield_sleep="))) {
		yield_sleep = atoi(tmp_ptr + 15);
		if (yield_sleep <= 0) {
			error("Invalid backfill scheduler bf_yield_sleep: %d",
			      yield_sleep);
			yield_sleep = YIELD_SLEEP;
		}
	} else {
		yield_sleep = YIELD_SLEEP;
	}

	if (sched_params && (tmp_ptr = strstr(sched_params, "max_rpc_cnt=")))
		defer_rpc_cnt = atoi(tmp_ptr + 12);
	else if (sched_params &&
		 (tmp_ptr = strstr(sched_params, "max_rpc_count=")))
		defer_rpc_cnt = atoi(tmp_ptr + 14);
	else
		defer_rpc_cnt = 0;
	if (defer_rpc_cnt < 0) {
		error("Invalid SchedulerParameters max_rpc_cnt: %d",
		      defer_rpc_cnt);
		defer_rpc_cnt = 0;
	}

	xfree(sched_params);
}

/* Note that slurm.conf has changed */
extern void backfill_reconfig(void)
{
	slurm_mutex_lock(&config_lock);
	config_flag = true;
	slurm_mutex_unlock(&config_lock);
}

/* Update backfill scheduling statistics
 * IN tv1 - start time
 * IN tv2 - end (current) time
 */
static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2)
{
	uint32_t delta_t, real_time;

	delta_t  = (tv2->tv_sec - tv1->tv_sec) * 1000000;
	delta_t +=  tv2->tv_usec;
	delta_t -=  tv1->tv_usec;
	real_time = delta_t - bf_sleep_usec;

	slurmctld_diag_stats.bf_cycle_counter++;
	slurmctld_diag_stats.bf_cycle_sum += real_time;
	slurmctld_diag_stats.bf_cycle_last = real_time;

	slurmctld_diag_stats.bf_depth_sum += slurmctld_diag_stats.bf_last_depth;
	slurmctld_diag_stats.bf_depth_try_sum +=
		slurmctld_diag_stats.bf_last_depth_try;
	if (slurmctld_diag_stats.bf_cycle_last >
	    slurmctld_diag_stats.bf_cycle_max) {
		slurmctld_diag_stats.bf_cycle_max = slurmctld_diag_stats.
						    bf_cycle_last;
	}

	slurmctld_diag_stats.bf_active = 0;
}


/* backfill_agent - detached thread periodically attempts to backfill jobs */
extern void *backfill_agent(void *args)
{
	time_t now;
	double wait_time;
	static time_t last_backfill_time = 0;
	/* Read config and partitions; Write jobs and nodes */
	slurmctld_lock_t all_locks = {
		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
	bool load_config;
	bool short_sleep = false;
	int backfill_cnt = 0;

#if HAVE_SYS_PRCTL_H
	if (prctl(PR_SET_NAME, "bckfl", NULL, NULL, NULL) < 0) {
		error("%s: cannot set my name to %s %m", __func__, "bckfl");
	}
#endif
	_load_config();
	last_backfill_time = time(NULL);
	pack_job_list = list_create(_pack_map_del);
	while (!stop_backfill) {
		if (short_sleep)
			_my_sleep(1000000);
		else
			_my_sleep(backfill_interval * 1000000);
		if (stop_backfill)
			break;

		if (slurmctld_config.scheduling_disabled)
			continue;

		slurm_mutex_lock(&config_lock);
		if (config_flag) {
			config_flag = false;
			load_config = true;
		} else {
			load_config = false;
		}
		slurm_mutex_unlock(&config_lock);
		if (load_config)
			_load_config();
		now = time(NULL);
		wait_time = difftime(now, last_backfill_time);
		if ((wait_time < backfill_interval) ||
		    job_is_completing(NULL) || _many_pending_rpcs() ||
		    !avail_front_end(NULL) || !_more_work(last_backfill_time)) {
			short_sleep = true;
			continue;
		}
		lock_slurmctld(all_locks);
		if ((backfill_cnt++ % 2) == 0)
			_pack_start_clear();
		(void) _attempt_backfill();
		last_backfill_time = time(NULL);
		(void) bb_g_job_try_stage_in();
		unlock_slurmctld(all_locks);
		short_sleep = false;
	}
	FREE_NULL_LIST(pack_job_list);

	return NULL;
}

/* Clear the start_time for all pending jobs. This is used to ensure
 * that a job which can run in multiple partitions has its start_time
 * set to the smallest value in any of those partitions. */
static int _clear_job_start_times(void *x, void *arg)
{
	struct job_record *job_ptr = (struct job_record *) x;
	if (IS_JOB_PENDING(job_ptr))
		job_ptr->start_time = 0;
	return SLURM_SUCCESS;
}

/* Return non-zero to break the backfill loop on a change in job, node
 * or partition state, or if the backfill scheduler needs to be stopped */
static int _yield_locks(int usec)
{
	slurmctld_lock_t all_locks = {
		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
	time_t job_update, node_update, part_update;
	bool load_config = false;
	int max_rpc_cnt;

	max_rpc_cnt = MAX((defer_rpc_cnt / 10), 20);
	job_update  = last_job_update;
	node_update = last_node_update;
	part_update = last_part_update;

	unlock_slurmctld(all_locks);
	while (!stop_backfill) {
		bf_sleep_usec += _my_sleep(usec);
		if ((defer_rpc_cnt == 0) ||
		    (slurmctld_config.server_thread_count <= max_rpc_cnt))
			break;
		verbose("backfill: continuing to yield locks, %d RPCs pending",
			slurmctld_config.server_thread_count);
	}
	lock_slurmctld(all_locks);
	slurm_mutex_lock(&config_lock);
	if (config_flag)
		load_config = true;
	slurm_mutex_unlock(&config_lock);

	if ((last_job_update  == job_update)  &&
	    (last_node_update == node_update) &&
	    (last_part_update == part_update) &&
	    (! stop_backfill) && (! load_config))
		return 0;
	else
		return 1;
}

/* Test if this job still has access to the specified partition. The job's
 * available partitions may have changed when locks were released */
static bool _job_part_valid(struct job_record *job_ptr,
			    struct part_record *part_ptr)
{
	struct part_record *avail_part_ptr;
	ListIterator part_iterator;
	bool rc = false;

	if (job_ptr->part_ptr_list) {
		part_iterator = list_iterator_create(job_ptr->part_ptr_list);
		while ((avail_part_ptr = (struct part_record *)
				list_next(part_iterator))) {
			if (avail_part_ptr == part_ptr) {
				rc = true;
				break;
			}
		}
		list_iterator_destroy(part_iterator);
	} else if (job_ptr->part_ptr == part_ptr) {
		rc = true;
	}

	return rc;
}

/* Determine if job in the backfill queue is still runnable.
 * Job state could change when locks are periodically released */
static bool _job_runnable_now(struct job_record *job_ptr)
{
	uint16_t cleaning = 0;

	if (!IS_JOB_PENDING(job_ptr))	/* Started in other partition */
		return false;
	if (job_ptr->priority == 0)	/* Job has been held */
		return false;
	if (IS_JOB_COMPLETING(job_ptr))	/* Started, requeue and completing */
		return false;
	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				    SELECT_JOBDATA_CLEANING, &cleaning);
	if (cleaning)			/* Started, requeue and completing */
		return false;

	return true;
}

static int _attempt_backfill(void)
{
	DEF_TIMERS;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	int bb, i, j, k, node_space_recs, mcs_select = 0;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	struct job_record *job_ptr;
	struct part_record *part_ptr, **bf_part_ptr = NULL;
	uint32_t end_time, end_reserve, deadline_time_limit, boot_time;
	uint32_t orig_end_time;
	uint32_t time_limit, comp_time_limit, orig_time_limit, part_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *active_bitmap = NULL, *avail_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL, *resv_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end, window_end;
	time_t pack_time, orig_sched_start, orig_start_time = (time_t) 0;
	node_space_map_t *node_space;
	user_part_rec_t *bf_user_part_ptr = NULL;
	struct timeval bf_time1, bf_time2;
	int rc = 0, error_code;
	int job_test_count = 0, test_time_count = 0, pend_time;
	uint32_t *uid = NULL, nuser = 0, bf_parts = 0;
	uint32_t *bf_part_jobs = NULL, *bf_part_resv = NULL;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;
	struct part_record *reject_array_part = NULL;
	uint32_t job_start_cnt = 0, start_time;
	time_t config_update = slurmctld_conf.last_update;
	time_t part_update = last_part_update;
	struct timeval start_tv;
	uint32_t test_array_job_id = 0;
	uint32_t test_array_count = 0;
	uint32_t job_no_reserve;
	bool resv_overlap = false;
	uint8_t save_share_res, save_whole_node;
	int test_fini;
	int user_part_inx1 = -1, user_part_inx2 = -1;
	int part_inx = -1, user_inx = -1;
	uint32_t qos_flags = 0;
	time_t qos_blocked_until = 0, qos_part_blocked_until = 0;
	/* QOS Read lock */
	assoc_mgr_lock_t qos_read_lock =
		{ NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK,
		  NO_LOCK, NO_LOCK, NO_LOCK };

	bf_sleep_usec = 0;

	if (!fed_mgr_sibs_synced()) {
		debug("backfill: %s returning, federation siblings not synced yet",
		      __func__);
		return SLURM_SUCCESS;
	}

#ifdef HAVE_ALPS_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_update_block(NULL)) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1000000);
	if (stop_backfill)
		return SLURM_SUCCESS;
#endif
	(void) bb_g_load_state(false);

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	else
		debug("backfill: beginning");
	sched_start = orig_sched_start = now = time(NULL);
	gettimeofday(&start_tv, NULL);

	job_queue = build_job_queue(true, true);
	job_test_count = list_count(job_queue);
	if (job_test_count == 0) {
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill: no jobs to backfill");
		else
			debug("backfill: no jobs to backfill");
		FREE_NULL_LIST(job_queue);
		return 0;
	} else {
		debug("backfill: %u jobs to backfill", job_test_count);
		job_test_count = 0;
	}

	if (backfill_continue)
		list_for_each(job_list, _clear_job_start_times, NULL);

	gettimeofday(&bf_time1, NULL);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	slurmctld_diag_stats.bf_active = 1;

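	/*
	 * Initialize the node space map with a single record spanning the
	 * entire backfill window. Each job given a reservation adds at most
	 * two new time boundaries (records), hence the allocation of
	 * (max_backfill_job_cnt * 2 + 1) records.
	 */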
	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt * 2 + 1));
	node_space[0].begin_time = sched_start;
	window_end = sched_start + backfill_window;
	node_space[0].end_time = window_end;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
		_dump_node_space_table(node_space);

	if (bf_job_part_count_reserve || max_backfill_job_per_part) {
		ListIterator part_iterator;
		struct part_record *part_ptr;
		bf_parts = list_count(part_list);
		bf_part_ptr  = xmalloc(sizeof(struct part_record *) * bf_parts);
		bf_part_jobs = xmalloc(sizeof(uint32_t) * bf_parts);
		bf_part_resv = xmalloc(sizeof(uint32_t) * bf_parts);
		part_iterator = list_iterator_create(part_list);
		i = 0;
		while ((part_ptr = (struct part_record *)
				   list_next(part_iterator))) {
			bf_part_ptr[i++] = part_ptr;
		}
		list_iterator_destroy(part_iterator);
	}
	if (max_backfill_job_per_user || max_backfill_job_per_assoc) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}

	if (max_backfill_job_per_user_part) {
		ListIterator part_iterator;
		struct part_record *part_ptr;
		bf_parts = list_count(part_list);
		bf_user_part_ptr = xmalloc(sizeof(user_part_rec_t) * bf_parts);
		part_iterator = list_iterator_create(part_list);
		i = 0;
		while ((part_ptr = (struct part_record *)
				   list_next(part_iterator))) {
			bf_user_part_ptr[i].part_ptr = part_ptr;
			bf_user_part_ptr[i].njobs =
				xmalloc(BF_MAX_USERS * sizeof(uint16_t));
			bf_user_part_ptr[i++].uid =
				xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		}
		list_iterator_destroy(part_iterator);
	}

	if (assoc_limit_stop) {
		assoc_mgr_lock(&qos_read_lock);
		list_for_each(assoc_mgr_qos_list,
			      _clear_qos_blocked_times, NULL);
		assoc_mgr_unlock(&qos_read_lock);
	}

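	/*
	 * Main backfill loop: pop jobs from the priority-sorted queue and,
	 * for each one, find the earliest time and node set at which it
	 * could run without delaying the expected start of any higher
	 * priority job, reserving resources in the node space map as needed.
	 */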
	sort_job_queue(job_queue);
	while (1) {
		uint32_t bf_job_id, bf_array_task_id, bf_job_priority;

		job_queue_rec = (job_queue_rec_t *) list_pop(job_queue);
		if (!job_queue_rec) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: reached end of job queue");
			break;
		}

		job_ptr          = job_queue_rec->job_ptr;
		part_ptr         = job_queue_rec->part_ptr;
		bf_job_id        = job_queue_rec->job_id;
		bf_job_priority  = job_queue_rec->priority;
		bf_array_task_id = job_queue_rec->array_task_id;
		xfree(job_queue_rec);

		if (slurmctld_config.shutdown_time ||
		    (difftime(time(NULL), orig_sched_start) >= bf_max_time)) {
			break;
		}
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >= defer_rpc_cnt)) ||
		    (slurm_delta_tv(&start_tv) >= sched_timeout)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: yielding locks after testing "
				     "%u(%d) jobs, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) && !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				break;
			}
			if (stop_backfill)
				break;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 0;
			test_time_count = 0;
			START_TIMER;
		}

		/* With bf_continue configured, the original job could have
		 * been cancelled and purged. Validate pointer here. */
		if ((job_ptr->magic  != JOB_MAGIC) ||
		    (job_ptr->job_id != bf_job_id)) {
			continue;
		}
		if ((job_ptr->array_task_id != bf_array_task_id) &&
		    (bf_array_task_id == NO_VAL)) {
			/* Job array element started in other partition,
			 * reset pointer to "master" job array record */
			job_ptr = find_job_record(job_ptr->array_job_id);
			if (!job_ptr)	/* All task array elements started */
				continue;
		}

		/*
		 * Establish baseline (worst case) start time for pack job
		 * Update time once start time estimate established
		 */
		_pack_start_set(job_ptr, (now + YEAR_SECONDS), NO_VAL);

		if (!_job_runnable_now(job_ptr))
			continue;

		job_ptr->last_sched_eval = now;
		job_ptr->part_ptr = part_ptr;
		job_ptr->priority = bf_job_priority;
		mcs_select = slurm_mcs_get_select(job_ptr);
		pack_time = _pack_start_find(job_ptr, now);
		if (pack_time > (now + backfill_window))
			continue;

		if (job_ptr->state_reason == FAIL_ACCOUNT) {
			slurmdb_assoc_rec_t assoc_rec;
			memset(&assoc_rec, 0, sizeof(slurmdb_assoc_rec_t));
			assoc_rec.acct      = job_ptr->account;
			if (job_ptr->part_ptr)
				assoc_rec.partition = job_ptr->part_ptr->name;
			assoc_rec.uid       = job_ptr->user_id;

			if (!assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
						    accounting_enforce,
						     &job_ptr->assoc_ptr,
						     false)) {
				job_ptr->state_reason = WAIT_NO_REASON;
				xfree(job_ptr->state_desc);
				job_ptr->assoc_id = assoc_rec.id;
				last_job_update = now;
			} else {
				debug("backfill: JobId=%u has invalid association",
				      job_ptr->job_id);
				xfree(job_ptr->state_desc);
				job_ptr->state_reason =
					WAIT_ASSOC_RESOURCE_LIMIT;
				continue;
			}
		}

		if (job_ptr->qos_id) {
			assoc_mgr_lock_t locks = {
				READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK,
				NO_LOCK, NO_LOCK, NO_LOCK };

			assoc_mgr_lock(&locks);
			if (job_ptr->assoc_ptr
			    && (accounting_enforce & ACCOUNTING_ENFORCE_QOS)
			    && ((job_ptr->qos_id >= g_qos_count) ||
				!bit_test(job_ptr->assoc_ptr->usage->valid_qos,
					  job_ptr->qos_id))
			    && !job_ptr->limit_set.qos) {
				debug("backfill: JobId=%u has invalid QOS",
				      job_ptr->job_id);
				xfree(job_ptr->state_desc);
				job_ptr->state_reason = FAIL_QOS;
				last_job_update = now;
				assoc_mgr_unlock(&locks);
				continue;
			} else if (job_ptr->state_reason == FAIL_QOS) {
				xfree(job_ptr->state_desc);
				job_ptr->state_reason = WAIT_NO_REASON;
				last_job_update = now;
			}
			assoc_mgr_unlock(&locks);
		}

		assoc_mgr_lock(&qos_read_lock);
		if (job_ptr->qos_ptr) {
			qos_flags = job_ptr->qos_ptr->flags;
			qos_blocked_until = job_ptr->qos_ptr->blocked_until;
		} else {
			qos_flags = 0;
			qos_blocked_until = 0;
		}

		if (job_ptr->part_ptr->qos_ptr)
			qos_part_blocked_until =
				job_ptr->part_ptr->qos_ptr->blocked_until;
		else
			qos_part_blocked_until = 0;

		if (part_policy_valid_qos(job_ptr->part_ptr,
					  job_ptr->qos_ptr) != SLURM_SUCCESS) {
			assoc_mgr_unlock(&qos_read_lock);
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = WAIT_QOS;
			last_job_update = now;
			continue;
		}
		assoc_mgr_unlock(&qos_read_lock);

		if (!assoc_limit_stop &&
		    !acct_policy_job_runnable_pre_select(job_ptr)) {
			continue;
		}

		job_no_reserve = 0;
		if (bf_min_prio_reserve &&
		    (job_ptr->priority < bf_min_prio_reserve)) {
			job_no_reserve = TEST_NOW_ONLY;
		} else if (bf_min_age_reserve && job_ptr->details->begin_time) {
			pend_time = difftime(time(NULL),
					     job_ptr->details->begin_time);
			if (pend_time < bf_min_age_reserve)
				job_no_reserve = TEST_NOW_ONLY;
		}

		if ((job_no_reserve == 0) && bf_job_part_count_reserve) {
			for (j = 0; j < bf_parts; j++) {
				if (bf_part_ptr[j] != job_ptr->part_ptr)
					continue;
				if (bf_part_resv[j] >=
				    bf_job_part_count_reserve)
					job_no_reserve = TEST_NOW_ONLY;
				break;
			}
		}

		orig_start_time = job_ptr->start_time;
		orig_time_limit = job_ptr->time_limit;

next_task:
		job_test_count++;
		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		if (!IS_JOB_PENDING(job_ptr) ||	/* Started in other partition */
		    (job_ptr->priority == 0))	/* Job has been held */
			continue;
		if (job_ptr->preempt_in_progress)
			continue; 	/* scheduled in another partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (!_job_part_valid(job_ptr, part_ptr))
			continue;	/* Partition change during lock yield */
		if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
			if ((reject_array_job_id == job_ptr->array_job_id) &&
			    (reject_array_part   == part_ptr))
				continue;  /* already rejected array element */

			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
			reject_array_part   = part_ptr;

			if (!job_array_start_test(job_ptr))
				continue;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			char job_id_str[64];
			info("backfill test for %s Prio=%u Partition=%s",
			     jobid2fmt(job_ptr, job_id_str, sizeof(job_id_str)),
			     job_ptr->priority, job_ptr->part_ptr->name);
		}

		/* Test to see if we've exceeded any per user/partition limit */
		if (max_backfill_job_per_user_part) {
			bool skip_job = false;
			for (j = 0; j < bf_parts; j++) {
				if (bf_user_part_ptr[j].part_ptr !=
				    job_ptr->part_ptr)
					continue;
				for (k = 0; k < bf_user_part_ptr[j].user_cnt;
				     k++) {
					if (bf_user_part_ptr[j].uid[k] !=
					    job_ptr->user_id)
						continue;
					user_part_inx1 = j;
					user_part_inx2 = k;
					if ((bf_user_part_ptr[j].njobs[k] + 1)
					    > max_backfill_job_per_user_part)
						skip_job = true;
					break;
				}
				if ((k == bf_user_part_ptr[j].user_cnt) &&
				    (k < BF_MAX_USERS)) {
					bf_user_part_ptr[j].user_cnt++;
					bf_user_part_ptr[j].uid[k] =
						job_ptr->user_id;
					user_part_inx1 = j;
					user_part_inx2 = k;
				}
				break;
			}
			if (skip_job) {
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					info("backfill: have already "
					     "checked %u jobs for user %u on "
					     "partition %s; skipping "
					     "job %u",
					     max_backfill_job_per_user_part,
					     job_ptr->user_id,
					     job_ptr->part_ptr->name,
					     job_ptr->job_id);
				continue;
			}
		}
		if (max_backfill_job_per_part) {
			bool skip_job = false;
			for (j = 0; j < bf_parts; j++) {
				if (bf_part_ptr[j] != job_ptr->part_ptr)
					continue;
				part_inx = j;
				if ((bf_part_jobs[j] + 1) >
				    max_backfill_job_per_part)
					skip_job = true;
				break;
			}
			if (skip_job) {
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					info("backfill: have already "
					     "checked %u jobs for "
					     "partition %s; skipping "
					     "job %u",
					     max_backfill_job_per_part,
					     job_ptr->part_ptr->name,
					     job_ptr->job_id);
				continue;
			}
		}

		if (max_backfill_job_per_assoc) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->assoc_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u assoc %u: #jobs %u",
						      job_ptr->user_id,
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* assoc not found */
				static bool bf_max_user_msg = true;
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->assoc_id;
					njobs[j] = 1;
					nuser++;
				} else if (bf_max_user_msg) {
					bf_max_user_msg = false;
					error("backfill: too many associations in queue. Conside increasing BF_MAX_USERS from %u (g_user_assoc_count=%u)",
					      nuser, g_user_assoc_count);
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user/assoc %u/%u.  Total #users/assoc now %u",
					       job_ptr->user_id,
					       job_ptr->assoc_id, nuser);
			} else {
				if (njobs[j] >= max_backfill_job_per_assoc) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						info("backfill: have already checked %u jobs for user %u, assoc %u; skipping job %u",
						     max_backfill_job_per_assoc,
						     job_ptr->user_id,
						     job_ptr->assoc_id,
						     job_ptr->job_id);
					continue;
				}
			}
		}

		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					user_inx = j;
					if (debug_flags & DEBUG_FLAG_BACKFILL) {
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					}
					break;
				}
			}
			if (j == nuser) { /* user not found */
				static bool bf_max_user_msg = true;
				if (nuser < BF_MAX_USERS) {
					user_inx = j;
					uid[j] = job_ptr->user_id;
					nuser++;
				} else if (bf_max_user_msg) {
					bf_max_user_msg = false;
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
				}
			} else {
				if ((njobs[j] + 1) > max_backfill_job_per_user){
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL) {
						info("backfill: have already "
						     "checked %u jobs for "
						     "user %u; skipping "
						     "job %u",
						     max_backfill_job_per_user,
						     job_ptr->user_id,
						     job_ptr->job_id);
					}
					continue;
				}
			}
		}

		/* Increment our user/partition limit counters as needed */
		if (max_backfill_job_per_user_part &&
		    (user_part_inx1 != -1) && (user_part_inx2 != -1)) {
			bf_user_part_ptr[user_part_inx1].
				njobs[user_part_inx2]++;
		}
		if (max_backfill_job_per_part && (part_inx != -1))
			bf_part_jobs[part_inx]++;
		if (max_backfill_job_per_user && (user_inx != -1))
			njobs[user_inx]++;

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: partition %s not usable",
				     job_ptr->part_ptr->name);
			continue;
		}

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL), true) !=
		     SLURM_SUCCESS)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u not runable now",
				     job_ptr->job_id);
			continue;
		}

		/* Determine minimum and maximum node counts */
		error_code = get_node_cnts(job_ptr, qos_flags, part_ptr,
					   &min_nodes, &req_nodes, &max_nodes);

		if (error_code == ESLURM_ACCOUNTING_POLICY) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u acct policy node limit",
				     job_ptr->job_id);
			continue;
		} else if (error_code ==
			   ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u node count too high",
				     job_ptr->job_id);
			continue;
		} else if (error_code != SLURM_SUCCESS) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: error setting nodes for job %u: %s",
				     job_ptr->job_id,
				     slurm_strerror(error_code));
			continue;
		}

		/* test of deadline */
		now = time(NULL);
		deadline_time_limit = 0;
		if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) {
			if (!deadline_ok(job_ptr, "backfill"))
				continue;

			deadline_time_limit = (job_ptr->deadline - now) / 60;
		}

		/* Determine job's expected completion time */
		if (part_ptr->max_time == INFINITE)
			part_time_limit = YEAR_MINUTES;
		else
			part_time_limit = part_ptr->max_time;
		if ((job_ptr->time_limit == NO_VAL) ||
		    (job_ptr->time_limit == INFINITE)) {
			time_limit = part_time_limit;
			job_ptr->limit_set.time = 1;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_time_limit);
		}
		if (deadline_time_limit)
			comp_time_limit = MIN(time_limit, deadline_time_limit);
		else
			comp_time_limit = time_limit;
		if ((qos_flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		later_start = now;

		if (assoc_limit_stop) {
			if (qos_blocked_until > later_start) {
				later_start = qos_blocked_until;
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					info("QOS blocked_until move start_res to %ld",
					     later_start);
			}
			if (qos_part_blocked_until > later_start) {
				later_start = qos_part_blocked_until;
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					info("Part QOS blocked_until move start_res to %ld",
					     later_start);
			}
		}

 TRY_LATER:
		if (slurmctld_config.shutdown_time ||
		    (difftime(time(NULL), orig_sched_start) >=
		     bf_max_time)) {
			_set_job_time_limit(job_ptr, orig_time_limit);
			break;
		}
		test_time_count++;
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >= defer_rpc_cnt)) ||
		    (slurm_delta_tv(&start_tv) >= sched_timeout)) {
			uint32_t save_job_id = job_ptr->job_id;
			uint32_t save_time_limit = job_ptr->time_limit;
			_set_job_time_limit(job_ptr, orig_time_limit);
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: yielding locks after testing "
				     "%u(%d) jobs tested, %u time slots, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, test_time_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) && !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				break;
			}
			if (stop_backfill)
				break;

			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 1;
			test_time_count = 0;
			START_TIMER;

			/* With bf_continue configured, the original job could
			 * have been scheduled or cancelled and purged.
			 * Revalidate the job record here. */
			if ((job_ptr->magic  != JOB_MAGIC) ||
			    (job_ptr->job_id != save_job_id))
				continue;
			if (!_job_runnable_now(job_ptr))
				continue;
			if (!avail_front_end(job_ptr))
				continue;	/* No available frontend */
			if (!job_independent(job_ptr, 0)) {
				/* No longer independent
				 * (e.g. another singleton started) */
				continue;
			}

			job_ptr->time_limit = save_time_limit;
			job_ptr->part_ptr = part_ptr;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res = MAX(later_start, pack_time);
		resv_end = 0;
		later_start = 0;
		/* Determine impact of any advance reservations */
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap, &resv_overlap, false);
		if (j != SLURM_SUCCESS) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u reservation defer",
				     job_ptr->job_id);
			_set_job_time_limit(job_ptr, orig_time_limit);
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		if (end_time < now)	/* Overflow 32-bits */
			end_time = INFINITE;
		if (resv_overlap)
			resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		filter_by_node_owner(job_ptr, avail_bitmap);
		filter_by_node_mcs(job_ptr, mcs_select, avail_bitmap);
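		/*
		 * Scan the node space map: AND into avail_bitmap the nodes
		 * free in every record overlapping the candidate run window
		 * [start_res, end_time], and remember the next record
		 * boundary as later_start so the job can be retried at that
		 * time if it does not fit here.
		 */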
		for (j = 0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if (resv_end && (++resv_end < window_end) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_and_not(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features OR
		 *	no change since previously tested nodes (only changes
		 *	in other partition nodes) */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap, true))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}

			/* Job cannot start until too far in the future */
			_set_job_time_limit(job_ptr, orig_time_limit);
			if (orig_start_time != 0) {
				/* Can start earlier in different partition */
				job_ptr->start_time = orig_start_time;
			} else
				job_ptr->start_time = 0;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}
		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_job_test(job_ptr, avail_bitmap, start_res);
		test_fini = -1;
		build_active_feature_bitmap(job_ptr, avail_bitmap,
					    &active_bitmap);
		job_ptr->bit_flags |= BACKFILL_TEST;
		job_ptr->bit_flags |= job_no_reserve;	/* 0 or TEST_NOW_ONLY */
		if (active_bitmap) {
			j = _try_sched(job_ptr, &active_bitmap, min_nodes,
				       max_nodes, req_nodes, exc_core_bitmap);
			if (j == SLURM_SUCCESS) {
				FREE_NULL_BITMAP(avail_bitmap);
				avail_bitmap = active_bitmap;
				active_bitmap = NULL;
				test_fini = 1;
			} else {
				FREE_NULL_BITMAP(active_bitmap);
				save_share_res  = job_ptr->details->share_res;
				save_whole_node = job_ptr->details->whole_node;
				job_ptr->details->share_res = 0;
				job_ptr->details->whole_node = 1;
				test_fini = 0;
			}
		}
		boot_time = 0;
		if (test_fini == 0) {
			/* Unable to start job using currently active features,
			 * need to try using features which can be made
			 * available after node reboot */
			bitstr_t *tmp_core_bitmap = NULL;
			bitstr_t *tmp_node_bitmap = NULL;
			debug2("backfill: entering _try_sched for job %u. "
			       "Need to use features which can be made "
			       "available after node reboot", job_ptr->job_id);
			/* Determine impact of any advance reservations */
			resv_end = 0;
			j = job_test_resv(job_ptr, &start_res, false,
					  &tmp_node_bitmap, &tmp_core_bitmap,
					  &resv_overlap, true);
			if (resv_overlap)
				resv_end = find_resv_end(start_res);
			if (resv_end && (++resv_end < window_end) &&
			    ((later_start == 0) || (resv_end < later_start))) {
				later_start = resv_end;
			}
			if (j == SLURM_SUCCESS) {
				FREE_NULL_BITMAP(exc_core_bitmap);
				exc_core_bitmap = tmp_core_bitmap;
				bit_and(avail_bitmap, tmp_node_bitmap);
				FREE_NULL_BITMAP(tmp_node_bitmap);
			}
			boot_time = node_features_g_boot_time();
			orig_end_time = end_time;
			end_time += boot_time;

			for (j = 0; ; ) {
				if (node_space[j].end_time <= start_res)
					;
				else if (node_space[j].begin_time <= end_time) {
					if (node_space[j].begin_time >
					    orig_end_time)
						bit_and(avail_bitmap,
						node_space[j].avail_bitmap);
				} else
					break;
				if ((j = node_space[j].next) == 0)
					break;
			}
		}
		if (test_fini != 1) {
			/* Either active_bitmap was NULL or not usable by the
			 * job. Test using avail_bitmap instead */
			j = _try_sched(job_ptr, &avail_bitmap, min_nodes,
				       max_nodes, req_nodes, exc_core_bitmap);
			if (test_fini == 0) {
				job_ptr->details->share_res = save_share_res;
				job_ptr->details->whole_node = save_whole_node;
			}
		}
		job_ptr->bit_flags &= ~BACKFILL_TEST;
		job_ptr->bit_flags &= ~TEST_NOW_ONLY;

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			_set_job_time_limit(job_ptr, orig_time_limit);
			if (orig_start_time != 0)  /* Can start in other part */
				job_ptr->start_time = orig_start_time;
			else
				job_ptr->start_time = 0;
			continue;	/* not runnable in this partition */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if ((job_ptr->start_time <= now) &&
		    (bit_overlap(avail_bitmap, cg_node_bitmap) > 0)) {
			/* Need to wait for in-progress completion/epilog */
			job_ptr->start_time = now + 1;
			later_start = 0;
		}
		if ((job_ptr->start_time <= now) &&
		    ((bb = bb_g_job_test_stage_in(job_ptr, true)) != 1)) {
			if (job_ptr->state_reason != WAIT_NO_REASON) {
				;
			} else if (bb == -1) {
				xfree(job_ptr->state_desc);
				job_ptr->state_reason =
					WAIT_BURST_BUFFER_RESOURCE;
				job_ptr->start_time =
					bb_g_job_get_est_start(job_ptr);
			} else {	/* bb == 0 */
				xfree(job_ptr->state_desc);
				job_ptr->state_reason =
					WAIT_BURST_BUFFER_STAGING;
				job_ptr->start_time = now + 1;
			}
			debug3("sched: JobId=%u. State=%s. Reason=%s. "
			       "Priority=%u.",
			       job_ptr->job_id,
			       job_state_string(job_ptr->job_state),
			       job_reason_string(job_ptr->state_reason),
			       job_ptr->priority);
			last_job_update = now;
			_set_job_time_limit(job_ptr, orig_time_limit);
			later_start = 0;
			if (bb == -1)
				continue;
		} else if ((job_ptr->pack_job_id == 0) &&
			   (job_ptr->start_time <= now)) { /* Can start now */
			uint32_t save_time_limit = job_ptr->time_limit;
			uint32_t hard_limit;
			bool reset_time = false;
			int rc;

			/* get fed job lock from origin cluster */
			if (fed_mgr_job_lock(job_ptr)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					info("backfill: JobId=%u can't get fed job lock from origin cluster to backfill job",
					     job_ptr->job_id);
				rc = ESLURM_FED_JOB_LOCK;
				goto skip_start;
			}

			rc = _start_job(job_ptr, resv_bitmap);

			if (rc == SLURM_SUCCESS) {
				/* If the following fails because of network
				 * connectivity, the origin cluster should ask
				 * when it comes back up if the cluster_lock
				 * cluster actually started the job */
				fed_mgr_job_start(job_ptr, job_ptr->start_time);
			} else {
				fed_mgr_job_unlock(job_ptr);
			}

skip_start:
			if (qos_flags & QOS_FLAG_NO_RESERVE) {
				if (orig_time_limit == NO_VAL) {
					acct_policy_alter_job(
						job_ptr, comp_time_limit);
					job_ptr->time_limit = comp_time_limit;
					job_ptr->limit_set.time = 1;
				} else {
					acct_policy_alter_job(
						job_ptr, orig_time_limit);
					_set_job_time_limit(job_ptr,
							    orig_time_limit);
				}
			} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				acct_policy_alter_job(job_ptr, comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
				reset_time = true;
			} else if (orig_time_limit == NO_VAL) {
				acct_policy_alter_job(job_ptr, comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
				job_ptr->limit_set.time = 1;
			} else {
				acct_policy_alter_job(job_ptr, orig_time_limit);
				_set_job_time_limit(job_ptr, orig_time_limit);
			}
			/* Only set end_time if start_time is set,
			 * or else end_time will be small (i.e. 1969). */
			if (job_ptr->start_time) {
				if (job_ptr->time_limit == INFINITE)
					hard_limit = YEAR_SECONDS;
				else
					hard_limit = job_ptr->time_limit * 60;
				job_ptr->end_time = job_ptr->start_time +
						    hard_limit;
				/* Only reset if start_time is set. end_time
				 * must be set beforehand for
				 * _reset_job_time_limit(). */
				if (reset_time) {
					_reset_job_time_limit(job_ptr, now,
							      node_space);
					time_limit = job_ptr->time_limit;
				}
			} else if (rc == SLURM_SUCCESS) {
				error("%s: start_time of 0 on successful "
				      "backfill. This shouldn't happen. :)",
				      __func__);
			}

			if ((rc == ESLURM_RESERVATION_BUSY) ||
			    (rc == ESLURM_ACCOUNTING_POLICY &&
			     !assoc_limit_stop) ||
			    (rc == ESLURM_POWER_NOT_AVAIL) ||
			    (rc == ESLURM_POWER_RESERVED)) {
				/* Unknown future start time, just skip job */
				if (orig_start_time != 0) {
					/* Can start in different partition */
					job_ptr->start_time = orig_start_time;
				} else
					job_ptr->start_time = 0;
				_set_job_time_limit(job_ptr, orig_time_limit);
				continue;
			} else if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time. Determining
				 * when it can start with certainty requires
				 * knowing when every running and pending job
				 * starts and ends, and tracking all of their
				 * resources. That requires very high overhead
				 * that we don't want to add. Estimate that it
				 * can start after the next job ends (or in 500
				 * seconds if we don't have that info yet). */
				if (later_start)
					job_ptr->start_time = later_start;
				else
					job_ptr->start_time = now + 500;
				if (job_ptr->qos_blocking_ptr &&
				    job_state_qos_grp_limit(
					    job_ptr->state_reason)) {
					assoc_mgr_lock(&qos_read_lock);
					qos_ptr = job_ptr->qos_blocking_ptr;
					if (qos_ptr->blocked_until <
					    job_ptr->start_time) {
						qos_ptr->blocked_until =
						job_ptr->start_time;
					}
					assoc_mgr_unlock(&qos_read_lock);
				}
			} else if (rc != SLURM_SUCCESS) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: planned start of job %u"
					     " failed: %s", job_ptr->job_id,
					     slurm_strerror(rc));
				}
				/* Drop through and reserve these resources.
				 * Likely due to state changes during sleep.
				 * Make best-effort based upon original state */
				_set_job_time_limit(job_ptr, orig_time_limit);
				later_start = 0;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;
				reject_array_part   = NULL;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_job_start_direct(
							acct_db_conn, job_ptr);
				job_start_cnt++;
				if (max_backfill_jobs_start &&
				    (job_start_cnt >= max_backfill_jobs_start)){
					if (debug_flags & DEBUG_FLAG_BACKFILL) {
						info("backfill: bf_max_job_start"
						     " limit of %d reached",
						     max_backfill_jobs_start);
					}
					break;
				}
				if (job_ptr->array_task_id != NO_VAL) {
					/* Try starting next task of job array */
					job_ptr = find_job_record(job_ptr->
								  array_job_id);
					if (job_ptr && IS_JOB_PENDING(job_ptr))
						goto next_task;
				}
				continue;
			}
		} else if (job_ptr->pack_job_id != 0) {
			uint32_t max_time_limit;
			max_time_limit = _get_job_max_tl(job_ptr, now,
							 node_space);
			comp_time_limit = MIN(comp_time_limit, max_time_limit);
			job_ptr->node_cnt_wag =
					MAX(bit_set_count(avail_bitmap), 1);
			_pack_start_set(job_ptr, job_ptr->start_time,
					comp_time_limit);
			_set_job_time_limit(job_ptr, orig_time_limit);
		}

		if ((job_ptr->start_time > now) && (job_no_reserve != 0)) {
			if ((orig_start_time != 0) &&
			    (orig_start_time < job_ptr->start_time)) {
				/* Can start earlier in different partition */
				job_ptr->start_time = orig_start_time;
			}
			continue;
		}

		if (later_start && (job_ptr->start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				info("backfill: Try later job %u later_start %ld",
			             job_ptr->job_id, later_start);
			}
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		start_time  = job_ptr->start_time;
		end_reserve = job_ptr->start_time + boot_time +
			      (time_limit * 60);
		start_time  = (start_time / backfill_resolution) *
			      backfill_resolution;
		end_reserve = (end_reserve / backfill_resolution) *
			      backfill_resolution;
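		/*
		 * Example with hypothetical values: with the default
		 * bf_resolution of 60 seconds, a start_time of 12:00:37 and
		 * an end_reserve of 13:00:37 both round down to the enclosing
		 * minute (12:00:00 and 13:00:00). Quantizing reservations
		 * onto common boundaries keeps the node_space table small.
		 */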

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				_dump_job_sched(job_ptr, end_reserve,
						avail_bitmap);
			if ((orig_start_time != 0) &&
			    (orig_start_time < job_ptr->start_time)) {
				/* Can start earlier in different partition */
				job_ptr->start_time = orig_start_time;
			}
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				info("backfill: table size limit of %u reached",
				     max_backfill_job_cnt);
			}
			if ((max_backfill_job_per_part != 0) &&
			    (max_backfill_job_per_part >=
			     max_backfill_job_cnt)) {
				error("bf_max_job_part >= bf_max_job_test (%u >= %u)",
				      max_backfill_job_per_part,
				      max_backfill_job_cnt);
			} else if ((max_backfill_job_per_user != 0) &&
				   (max_backfill_job_per_user >
				    max_backfill_job_cnt)) {
				info("warning: bf_max_job_user > bf_max_job_test (%u > %u)",
				     max_backfill_job_per_user,
				     max_backfill_job_cnt);
			} else if  ((max_backfill_job_per_assoc != 0) &&
				    (max_backfill_job_per_assoc >
				     max_backfill_job_cnt)) {
				info("warning: bf_max_job_assoc > bf_max_job_test (%u > %u)",
				     max_backfill_job_per_assoc,
				     max_backfill_job_cnt);
			}
			break;
		}

		if ((job_ptr->start_time > now) &&
		    (job_ptr->state_reason != WAIT_BURST_BUFFER_RESOURCE) &&
		    (job_ptr->state_reason != WAIT_BURST_BUFFER_STAGING) &&
		    _test_resv_overlap(node_space, avail_bitmap,
				       start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * a job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				info("backfill: Job %u overlaps with existing "
				     "reservation start_time=%u "
				     "end_reserve=%u boot_time=%u "
				     "later_start %ld", job_ptr->job_id,
				     start_time, end_reserve, boot_time,
				     later_start);
			}
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (!assoc_limit_stop) {
			uint32_t selected_node_cnt;
			uint64_t tres_req_cnt[slurmctld_tres_cnt];

			selected_node_cnt = bit_set_count(avail_bitmap);
			memcpy(tres_req_cnt, job_ptr->tres_req_cnt,
			       sizeof(tres_req_cnt));
			tres_req_cnt[TRES_ARRAY_CPU] =
				(uint64_t)(job_ptr->total_cpus ?
					   job_ptr->total_cpus :
					   job_ptr->details->min_cpus);

			tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(
						job_ptr->details->pn_min_memory,
						tres_req_cnt[TRES_ARRAY_CPU],
						selected_node_cnt);

			tres_req_cnt[TRES_ARRAY_NODE] =
				(uint64_t)selected_node_cnt;

			gres_set_job_tres_cnt(job_ptr->gres_list,
					      selected_node_cnt,
					      tres_req_cnt,
					      false);

			if (!acct_policy_job_runnable_post_select(job_ptr,
							  tres_req_cnt)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: adding reservation for "
					     "job %u blocked by "
					     "acct_policy_job_runnable_post_select",
					     job_ptr->job_id);
				}
				continue;
			}
		}
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		if (qos_flags & QOS_FLAG_NO_RESERVE)
			continue;
		if (bf_job_part_count_reserve) {
			bool do_reserve = true;
			for (j = 0; j < bf_parts; j++) {
				if (bf_part_ptr[j] != job_ptr->part_ptr)
					continue;
				if (bf_part_resv[j]++ >=
				    bf_job_part_count_reserve)
					do_reserve = false;
				break;
			}
			if (!do_reserve)
				continue;
		}
		reject_array_job_id = 0;
		reject_array_part   = NULL;
		xfree(job_ptr->sched_nodes);
		job_ptr->sched_nodes = bitmap2node_name(avail_bitmap);
		bit_not(avail_bitmap);
		_add_reservation(start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_node_space_table(node_space);
		if ((orig_start_time != 0) &&
		    (orig_start_time < job_ptr->start_time)) {
			/* Can start earlier in different partition */
			job_ptr->start_time = orig_start_time;
		}
		if (job_ptr->array_recs) {
			/* Try making reservation for next task of job array */
			if (test_array_job_id != job_ptr->array_job_id) {
				test_array_job_id = job_ptr->array_job_id;
				test_array_count = 1;
			} else {
				test_array_count++;
			}
			if ((test_array_count < bf_max_job_array_resv) &&
			    (test_array_count < job_ptr->array_recs->task_cnt))
				goto next_task;
		}
	}

	_pack_start_test(node_space);

	xfree(bf_part_jobs);
	xfree(bf_part_resv);
	xfree(bf_part_ptr);
	xfree(uid);
	xfree(njobs);
	if (bf_user_part_ptr) {
		for (i = 0; i < bf_parts; i++) {
			xfree(bf_user_part_ptr[i].njobs);
			xfree(bf_user_part_ptr[i].uid);
		}
		xfree(bf_user_part_ptr);
	}
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	for (i = 0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	FREE_NULL_LIST(job_queue);

	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %u(%d) jobs, %s",
		     slurmctld_diag_stats.bf_last_depth,
		     job_test_count, TIME_STR);
	}
	if (slurmctld_config.server_thread_count >= 150) {
		info("backfill: %d pending RPCs at cycle end, consider "
		     "configuring max_rpc_cnt",
		     slurmctld_config.server_thread_count);
	}
	return rc;
}

/* Try to start the job on any non-reserved nodes */
static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap)
{
	int rc;
	bitstr_t *orig_exc_nodes = NULL;
	bool is_job_array_head = false;
	static uint32_t fail_jobid = 0;

	if (job_ptr->details->exc_node_bitmap) {
		orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap);
		bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap);
	} else
		job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap);
	if (job_ptr->array_recs)
		is_job_array_head = true;
	rc = select_nodes(job_ptr, false, NULL, NULL, NULL);
	if (is_job_array_head && job_ptr->details) {
		struct job_record *base_job_ptr;
		base_job_ptr = find_job_record(job_ptr->array_job_id);
		if (base_job_ptr && base_job_ptr != job_ptr
				 && base_job_ptr->array_recs) {
			FREE_NULL_BITMAP(
					base_job_ptr->details->exc_node_bitmap);
			if (orig_exc_nodes)
				base_job_ptr->details->exc_node_bitmap =
					bit_copy(orig_exc_nodes);
		}
	}
	if (job_ptr->details) { /* select_nodes() might reset exc_node_bitmap */
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		job_ptr->details->exc_node_bitmap = orig_exc_nodes;
	} else
		FREE_NULL_BITMAP(orig_exc_nodes);
	if (rc == SLURM_SUCCESS) {
		/* job initiated */
		char job_id_str[64];
		last_job_update = time(NULL);
		info("backfill: Started %s in %s on %s",
		     jobid2fmt(job_ptr, job_id_str, sizeof(job_id_str)),
		     job_ptr->part_ptr->name, job_ptr->nodes);
		power_g_job_start(job_ptr);
		if (job_ptr->batch_flag == 0)
			srun_allocate(job_ptr->job_id);
		else if (
#ifdef HAVE_BG
			/*
			 * On a bluegene system we need to run the prolog
			 * while the job is CONFIGURING so this can't work
			 * off the CONFIGURING flag as done elsewhere.
			 */
			!job_ptr->details ||
			!job_ptr->details->prolog_running
#else
			!IS_JOB_CONFIGURING(job_ptr)
#endif
			)
			launch_job(job_ptr);
		slurmctld_diag_stats.backfilled_jobs++;
		slurmctld_diag_stats.last_backfilled_jobs++;
		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			info("backfill: Jobs backfilled since boot: %u",
			     slurmctld_diag_stats.backfilled_jobs);
		}
	} else if ((job_ptr->job_id != fail_jobid) &&
		   (rc != ESLURM_ACCOUNTING_POLICY)) {
		char *node_list;
		bit_not(resv_bitmap);
		node_list = bitmap2node_name(resv_bitmap);
		/* This happens when a job has sharing disabled and
		 * a selected node is still completing some job,
		 * which should be a temporary situation. */
		verbose("backfill: Failed to start JobId=%u with %s avail: %s",
			job_ptr->job_id, node_list, slurm_strerror(rc));
		xfree(node_list);
		fail_jobid = job_ptr->job_id;
	} else {
		debug3("backfill: Failed to start JobId=%u: %s",
		       job_ptr->job_id, slurm_strerror(rc));
	}

	return rc;
}
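
/*
 * Illustrative note on _start_job() above: it steers select_nodes() away
 * from nodes the backfill scheduler has promised to other jobs by
 * temporarily OR-ing resv_bitmap (the complement of the usable nodes) into
 * the job's exc_node_bitmap, then restoring the original exclusion list
 * once the allocation attempt completes.
 */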

/*
 * Compute a job's maximum time based upon conflicts in resources
 * planned for use by other jobs and that job's min/max time limit
 * Return NO_VAL if no restriction
 */
static uint32_t _get_job_max_tl(struct job_record *job_ptr, time_t now,
				node_space_map_t *node_space)
{
	int32_t j;
	time_t comp_time = 0;
	uint32_t max_tl = NO_VAL;

	if (job_ptr->time_min == 0)
		return max_tl;

	for (j = 0; ; ) {
		if ((node_space[j].begin_time != now) && /* no conflicts now */
		    (node_space[j].begin_time < job_ptr->end_time) &&
		    (!bit_super_set(job_ptr->node_bitmap,
				    node_space[j].avail_bitmap))) {
			/* Job overlaps pending job's resource reservation */
			if ((comp_time == 0) ||
			    (comp_time > node_space[j].begin_time))
				comp_time = node_space[j].begin_time;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}

	if (comp_time != 0)
		max_tl = (comp_time - now + 59) / 60;

	return max_tl;
}
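
/*
 * Worked example with hypothetical values: if the earliest conflicting
 * node_space record begins 3600 seconds from now, the job's maximum time
 * limit is (3600 + 59) / 60 = 60 minutes; the delay is converted to whole
 * minutes, rounding up.
 */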

/*
 * Reset a job's time limit (and end_time) as high as possible
 *	within the range job_ptr->time_min and job_ptr->time_limit.
 *	Avoid using resources reserved for pending jobs or in resource
 *	reservations
 */
static void _reset_job_time_limit(struct job_record *job_ptr, time_t now,
				  node_space_map_t *node_space)
{
	int32_t j, resv_delay;
	uint32_t orig_time_limit = job_ptr->time_limit;
	uint32_t new_time_limit;

	for (j = 0; ; ) {
		if ((node_space[j].begin_time != now) && /* no conflicts now */
		    (node_space[j].begin_time < job_ptr->end_time) &&
		    (!bit_super_set(job_ptr->node_bitmap,
				    node_space[j].avail_bitmap))) {
			/* Job overlaps pending job's resource reservation */
			resv_delay = difftime(node_space[j].begin_time, now);
			resv_delay /= 60;	/* seconds to minutes */
			if (resv_delay < job_ptr->time_limit)
				job_ptr->time_limit = resv_delay;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}
	new_time_limit = MAX(job_ptr->time_min, job_ptr->time_limit);
	acct_policy_alter_job(job_ptr, new_time_limit);
	job_ptr->time_limit = new_time_limit;
	job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60);

	job_time_adj_resv(job_ptr);

	if (orig_time_limit != job_ptr->time_limit) {
		info("backfill: job %u time limit changed from %u to %u",
		     job_ptr->job_id, orig_time_limit, job_ptr->time_limit);
	}
}
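
/*
 * Worked example with hypothetical values: a job with time_min=30 and
 * time_limit=120 (minutes) whose nodes conflict with a reservation
 * beginning 45 minutes from now has its time_limit clipped to
 * MAX(time_min, 45) = 45 minutes, and end_time is recomputed to match.
 */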

/* Report if any changes occurred to job, node or partition information */
static bool _more_work (time_t last_backfill_time)
{
	bool rc = false;

	slurm_mutex_lock( &thread_flag_mutex );
	if ( (last_job_update  >= last_backfill_time ) ||
	     (last_node_update >= last_backfill_time ) ||
	     (last_part_update >= last_backfill_time ) ) {
		rc = true;
	}
	slurm_mutex_unlock( &thread_flag_mutex );

	return rc;
}

/* Create a reservation for a job in the future */
static void _add_reservation(uint32_t start_time, uint32_t end_reserve,
			     bitstr_t *res_bitmap,
			     node_space_map_t *node_space,
			     int *node_space_recs)
{
	bool placed = false;
	int i, j;

#if 0
	info("add job start:%u end:%u", start_time, end_reserve);
	for (j = 0; ; ) {
		info("node start:%u end:%u",
		     (uint32_t) node_space[j].begin_time,
		     (uint32_t) node_space[j].end_time);
		if ((j = node_space[j].next) == 0)
			break;
	}
#endif

	start_time = MAX(start_time, node_space[0].begin_time);
	for (j = 0; ; ) {
		if (node_space[j].end_time > start_time) {
			/* insert start entry record */
			i = *node_space_recs;
			node_space[i].begin_time = start_time;
			node_space[i].end_time = node_space[j].end_time;
			node_space[j].end_time = start_time;
			node_space[i].avail_bitmap =
				bit_copy(node_space[j].avail_bitmap);
			node_space[i].next = node_space[j].next;
			node_space[j].next = i;
			(*node_space_recs)++;
			placed = true;
		}
		if (node_space[j].end_time == start_time) {
			/* no need to insert new start entry record */
			placed = true;
		}
		if (placed == true) {
			while ((j = node_space[j].next)) {
				if (end_reserve < node_space[j].end_time) {
					/* insert end entry record */
					i = *node_space_recs;
					node_space[i].begin_time = end_reserve;
					node_space[i].end_time = node_space[j].
								 end_time;
					node_space[j].end_time = end_reserve;
					node_space[i].avail_bitmap =
						bit_copy(node_space[j].
							 avail_bitmap);
					node_space[i].next = node_space[j].next;
					node_space[j].next = i;
					(*node_space_recs)++;
					break;
				}
				if (end_reserve == node_space[j].end_time) {
					break;
				}
			}
			break;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}

	for (j = 0; ; ) {
		if ((node_space[j].begin_time >= start_time) &&
		    (node_space[j].end_time <= end_reserve))
			bit_and(node_space[j].avail_bitmap, res_bitmap);
		if ((node_space[j].begin_time >= end_reserve) ||
		    ((j = node_space[j].next) == 0))
			break;
	}

	/* Merge up to one pair of adjacent records with identical bitmaps.
	 * This can significantly improve performance of the backfill tests. */
	for (i = 0; ; ) {
		if ((j = node_space[i].next) == 0)
			break;
		if (!bit_equal(node_space[i].avail_bitmap,
			       node_space[j].avail_bitmap)) {
			i = j;
			continue;
		}
		node_space[i].end_time = node_space[j].end_time;
		node_space[i].next = node_space[j].next;
		FREE_NULL_BITMAP(node_space[j].avail_bitmap);
		break;
	}
}
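
/*
 * Worked example with hypothetical values: starting from a single record
 * covering [0, WINDOW) with all nodes available, adding a reservation on
 * nodes N over [100, 200) splits the map into three records:
 *
 *	[0, 100)	avail = all nodes
 *	[100, 200)	avail = all nodes except N (res_bitmap is the
 *			complement of the allocated nodes, so bit_and()
 *			clears them)
 *	[200, WINDOW)	avail = all nodes
 *
 * A later reservation sharing a boundary (e.g. [200, 300)) reuses the
 * existing split points, and the merge pass above collapses at most one
 * adjacent pair of records whose bitmaps have become identical.
 */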

/*
 * Determine if the resource specification for a new job overlaps with a
 *	reservation that the backfill scheduler has made for a job to be
 *	started in the future.
 * IN node_space - map of available resources through time
 * IN use_bitmap - nodes to be allocated
 * IN start_time - start time of job
 * IN end_reserve - end time of job
 */
static bool _test_resv_overlap(node_space_map_t *node_space,
			       bitstr_t *use_bitmap, uint32_t start_time,
			       uint32_t end_reserve)
{
	bool overlap = false;
	int j;

	for (j=0; ; ) {
		if ((node_space[j].end_time   > start_time) &&
		    (node_space[j].begin_time < end_reserve) &&
		    (!bit_super_set(use_bitmap, node_space[j].avail_bitmap))) {
			overlap = true;
			break;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}
	return overlap;
}
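
/*
 * Continuing the hypothetical example after _add_reservation() above: a
 * candidate job requesting nodes that include any of N over [150, 250)
 * intersects the [100, 200) record, whose avail_bitmap no longer contains
 * N, so bit_super_set() fails and the function returns true (overlap).
 */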

/*
 * Delete a pack_job_rec_t record from a pack_job_list
 */
static void _pack_rec_del(void *x)
{
	pack_job_rec_t *rec = (pack_job_rec_t *) x;
	xfree(rec);
}

/*
 * Delete pack_job_map_t record from pack_job_list
 */
static void _pack_map_del(void *x)
{
	pack_job_map_t *map = (pack_job_map_t *) x;
	FREE_NULL_LIST(map->pack_job_list);
	xfree(map);
}

/*
 * Return 1 if a pack_job_map_t record with a specific pack_job_id is found.
 * Always return 1 if "key" is zero.
 */
static int _pack_find_map(void *x, void *key)
{
	pack_job_map_t *map = (pack_job_map_t *) x;
	uint32_t *pack_job_id = (uint32_t *) key;

	if ((pack_job_id == NULL) ||
	    (map->pack_job_id == *pack_job_id))
		return 1;
	return 0;
}

/*
 * Return 1 if a pack_job_rec_t record with a specific job_id is found.
 * Always return 1 if "key" is zero.
 */
static int _pack_find_rec(void *x, void *key)
{
	pack_job_rec_t *rec = (pack_job_rec_t *) x;
	uint32_t *job_id = (uint32_t *) key;

	if ((job_id == NULL) ||
	    (rec->job_id == *job_id))
		return 1;
	return 0;
}

/*
 * Remove vestigial elements from pack_job_list. For still-active elements,
 * clear the previously computed start time. This is used to periodically
 * clear history so that heterogeneous/pack jobs do not keep getting deferred
 * based upon stale system state.
 */
static void _pack_start_clear(void)
{
	pack_job_map_t *map;
	ListIterator iter;

	iter = list_iterator_create(pack_job_list);
	while ((map = (pack_job_map_t *) list_next(iter))) {
		if (map->prev_start == 0) {
			list_delete_item(iter);
		} else {
			map->prev_start = 0;
			(void) list_delete_all(map->pack_job_list,
					       _pack_find_rec, NULL);
		}
	}
	list_iterator_destroy(iter);
}

/*
 * For a given pack_job_map_t record, determine the earliest that it can start,
 * which is the time at which its latest-starting component begins. The
 * "exclude_job_id" is used to exclude a pack job component currently being
 * tested to start, presumably in a different partition.
 */
static time_t _pack_start_compute(pack_job_map_t *map, uint32_t exclude_job_id)
{
	ListIterator iter;
	pack_job_rec_t *rec;
	time_t latest_start = map->prev_start;

	iter = list_iterator_create(map->pack_job_list);
	while ((rec = (pack_job_rec_t *) list_next(iter))) {
		if (rec->job_id == exclude_job_id)
			continue;
		latest_start = MAX(latest_start, rec->latest_start);
	}
	list_iterator_destroy(iter);

	return latest_start;
}
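
/*
 * Example with hypothetical values: for a pack job whose map->prev_start is
 * zero and whose components were recorded with latest_start values of
 * now+60 and now+300, the map resolves to now+300. Excluding the second
 * component (e.g. while it is being retested in another partition) resolves
 * to now+60.
 */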

/*
 * Return the earliest that a job can start based upon _other_ components of
 * that same heterogeneous/pack job. Return 0 if no limitation.
 *
 * If the job's state reason is BeginTime (the way all pack jobs start) and that
 * time has passed, then clear the reason field.
 */
static time_t _pack_start_find(struct job_record *job_ptr, time_t now)
{
	pack_job_map_t *map;
	time_t latest_start = (time_t) 0;

	if (job_ptr->pack_job_id) {
		map = (pack_job_map_t *) list_find_first(pack_job_list,
							 _pack_find_map,
							 &job_ptr->pack_job_id);
		if (map) {
			latest_start = _pack_start_compute(map,
							   job_ptr->job_id);
		}

		/*
		 * All pack jobs are submitted with a begin time in the future
		 * so that all components can be submitted before any of them
		 * are scheduled, but we want to clear the BeginTime reason
		 * as soon as possible to avoid confusing users
		 */
		if (job_ptr->details->begin_time <= now) {
			if (job_ptr->state_reason == WAIT_TIME) {
				job_ptr->state_reason = WAIT_NO_REASON;
				last_job_update = now;
			}
			if (job_ptr->state_reason_prev == WAIT_TIME) {
				job_ptr->state_reason_prev = WAIT_NO_REASON;
				last_job_update = now;
			}
		}

		if (latest_start && (debug_flags & DEBUG_FLAG_HETERO_JOBS)) {
			long int delay = MAX(0, latest_start - time(NULL));
			info("Job %u+%u (%u) in partition %s expected to start in %ld secs",
			     job_ptr->pack_job_id, job_ptr->pack_job_offset,
			     job_ptr->job_id, job_ptr->part_ptr->name, delay);
		}
	}

	return latest_start;
}

/*
 * Record the earliest that a pack job component can start. If it can be
 * started in multiple partitions, we only record the earliest start time
 * for the job in any partition.
 */
static void _pack_start_set(struct job_record *job_ptr, time_t latest_start,
			    uint32_t comp_time_limit)
{
	pack_job_map_t *map;
	pack_job_rec_t *rec;

	if (comp_time_limit == NO_VAL)
		comp_time_limit = job_ptr->time_limit;
	if (job_ptr->pack_job_id) {
		map = (pack_job_map_t *) list_find_first(pack_job_list,
							 _pack_find_map,
							 &job_ptr->pack_job_id);
		if (map) {
			if (!map->comp_time_limit) {
				map->comp_time_limit = comp_time_limit;
			} else {
				map->comp_time_limit = MIN(map->comp_time_limit,
							   comp_time_limit);
			}
			rec = list_find_first(map->pack_job_list,
					      _pack_find_rec,
					      &job_ptr->job_id);
			if (rec && (rec->latest_start <= latest_start)) {
				/*
				 * This job can start at an earlier time in
				 * some other partition, so ignore the new info
				 */
			} else if (rec) {
				rec->latest_start = latest_start;
				rec->part_ptr = job_ptr->part_ptr;
			} else {
				rec = xmalloc(sizeof(pack_job_rec_t));
				rec->job_id = job_ptr->job_id;
				rec->job_ptr = job_ptr;
				rec->latest_start = latest_start;
				rec->part_ptr = job_ptr->part_ptr;
				list_append(map->pack_job_list, rec);
			}
		} else {
			rec = xmalloc(sizeof(pack_job_rec_t));
			rec->job_id = job_ptr->job_id;
			rec->job_ptr = job_ptr;
			rec->latest_start = latest_start;
			rec->part_ptr = job_ptr->part_ptr;
			map = xmalloc(sizeof(pack_job_map_t));
			map->comp_time_limit = comp_time_limit;
			map->pack_job_id = job_ptr->pack_job_id;
			map->pack_job_list = list_create(_pack_rec_del);
			list_append(map->pack_job_list, rec);
			list_append(pack_job_list, map);
		}

		if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
			time_t latest_start = _pack_start_compute(map, 0);
			long int delay = MAX(0, latest_start - time(NULL));
			info("Job %u+%u (%u) in partition %s set to start in %ld secs",
			     job_ptr->pack_job_id, job_ptr->pack_job_offset,
			     job_ptr->job_id, job_ptr->part_ptr->name, delay);
		}
	}
}

/*
 * Return TRUE if we have expected start times for all components of a pack job
 * and all components are valid and runnable.
 *
 * NOTE: This should never happen, but we will also start the job if all of the
 * other components are already running,
 */
static bool _pack_job_full(pack_job_map_t *map)
{
	struct job_record *pack_job_ptr, *job_ptr;
	ListIterator iter;
	bool rc = true;

	/*
	 * With bf_continue configured, the original job could have
	 * been cancelled and purged. Validate job record here.
	 */
	pack_job_ptr = find_job_record(map->pack_job_id);
	if (!pack_job_ptr || (pack_job_ptr->magic != JOB_MAGIC) ||
	    (pack_job_ptr->pack_job_id != map->pack_job_id) ||
	    !pack_job_ptr->pack_job_list ||
	    (!IS_JOB_RUNNING(pack_job_ptr) &&
	     !_job_runnable_now(pack_job_ptr))) {
		return false;
	}

	iter = list_iterator_create(pack_job_ptr->pack_job_list);
	while ((job_ptr = (struct job_record *) list_next(iter))) {
		if ((job_ptr->magic != JOB_MAGIC) ||
		    (job_ptr->pack_job_id != map->pack_job_id)) {
			rc = false;	/* bad job pointer */
			break;
		}
		if (IS_JOB_RUNNING(job_ptr))
			continue;
		if (!list_find_first(map->pack_job_list, _pack_find_rec,
				     &job_ptr->job_id) ||
		    !_job_runnable_now(job_ptr)) {
			rc = false;
			break;
		}
	}
	list_iterator_destroy(iter);

	return rc;
}

/*
 * Determine if all components of a pack job can be started now or are
 * prevented from doing so because of association or QOS limits.
 * Return true if they can all start.
 *
 * NOTE: That a pack job passes this test does not mean that it will be able
 * to run. For example, this test assumes resource allocation at the CPU
 * level. If each task is allocated one core with two CPUs (threads), then
 * the CPU limit test would not be accurate.
 */
static bool _pack_job_limit_check(pack_job_map_t *map, time_t now)
{
	struct job_record *job_ptr;
	pack_job_rec_t *rec;
	ListIterator iter;
	int begun_jobs = 0, fini_jobs = 0, slurmctld_tres_size;
	bool runnable = true;
	uint32_t selected_node_cnt;
	uint64_t tres_req_cnt[slurmctld_tres_cnt];
	uint64_t **tres_alloc_save = NULL;

	tres_alloc_save = xmalloc(sizeof(uint64_t *) *
				  list_count(map->pack_job_list));
	slurmctld_tres_size = sizeof(uint64_t) * slurmctld_tres_cnt;
	iter = list_iterator_create(map->pack_job_list);
	while ((rec = (pack_job_rec_t *) list_next(iter))) {
		job_ptr = rec->job_ptr;
		job_ptr->part_ptr = rec->part_ptr;
		selected_node_cnt = job_ptr->node_cnt_wag;
		memcpy(tres_req_cnt, job_ptr->tres_req_cnt,
		       slurmctld_tres_size);
		tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)(job_ptr->total_cpus ?
					       job_ptr->total_cpus :
					       job_ptr->details->min_cpus);
		tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(
					       job_ptr->details->pn_min_memory,
					       tres_req_cnt[TRES_ARRAY_CPU],
					       selected_node_cnt);
		tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)selected_node_cnt;
		gres_set_job_tres_cnt(job_ptr->gres_list, selected_node_cnt,
				      tres_req_cnt, false);

		if (acct_policy_job_runnable_pre_select(job_ptr) &&
		    acct_policy_job_runnable_post_select(job_ptr,
							 tres_req_cnt)) {
			tres_alloc_save[begun_jobs++] = job_ptr->tres_alloc_cnt;
			job_ptr->tres_alloc_cnt = xmalloc(slurmctld_tres_size);
			memcpy(job_ptr->tres_alloc_cnt, tres_req_cnt,
			       slurmctld_tres_size);
			acct_policy_job_begin(job_ptr);

		} else {
			runnable = false;
			break;
		}
	}

	list_iterator_reset(iter);
	while ((rec = (pack_job_rec_t *) list_next(iter))) {
		job_ptr = rec->job_ptr;
		if (begun_jobs > fini_jobs) {
			time_t end_time_exp = job_ptr->end_time_exp;
			job_ptr->end_time_exp = now;
			acct_policy_job_fini(job_ptr);
			job_ptr->end_time_exp = end_time_exp;
			xfree(job_ptr->tres_alloc_cnt);
			job_ptr->tres_alloc_cnt = tres_alloc_save[fini_jobs++];
		}
	}
	list_iterator_destroy(iter);
	xfree(tres_alloc_save);

	return runnable;
}

/*
 * Start all components of a pack job now
 */
static int _pack_start_now(pack_job_map_t *map, node_space_map_t *node_space)
{
	struct job_record *job_ptr;
	bitstr_t *avail_bitmap = NULL, *exc_core_bitmap = NULL;
	bitstr_t *resv_bitmap = NULL;
	pack_job_rec_t *rec;
	ListIterator iter;
	int mcs_select, rc = SLURM_SUCCESS;
	bool resv_overlap = false;
	time_t now = time(NULL), start_res;
	uint32_t hard_limit;

	iter = list_iterator_create(map->pack_job_list);
	while ((rec = (pack_job_rec_t *) list_next(iter))) {
		bool reset_time = false;
		job_ptr = rec->job_ptr;
		job_ptr->part_ptr = rec->part_ptr;

		/*
		 * Identify the nodes which this job can use
		 */
		start_res = now;
		rc = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				   &exc_core_bitmap, &resv_overlap, false);
		FREE_NULL_BITMAP(exc_core_bitmap);
		if (rc != SLURM_SUCCESS) {
			error("Pack job %u+%u (%u) failed to start due to reservation",
			      job_ptr->pack_job_id, job_ptr->pack_job_offset,
			      job_ptr->job_id);
			FREE_NULL_BITMAP(avail_bitmap);
			break;
		}
		bit_and(avail_bitmap, job_ptr->part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		filter_by_node_owner(job_ptr, avail_bitmap);
		mcs_select = slurm_mcs_get_select(job_ptr);
		filter_by_node_mcs(job_ptr, mcs_select, avail_bitmap);
		if (job_ptr->details->exc_node_bitmap) {
			bit_and_not(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
		}

		if (fed_mgr_job_lock(job_ptr)) {
			error("Pack job %u+%u (%u) failed to start due to fed job lock",
			      job_ptr->pack_job_id, job_ptr->pack_job_offset,
			      job_ptr->job_id);
			FREE_NULL_BITMAP(avail_bitmap);
			continue;
		}

		resv_bitmap = avail_bitmap;
		avail_bitmap = NULL;
		bit_not(resv_bitmap);
		rc = _start_job(job_ptr, resv_bitmap);
		FREE_NULL_BITMAP(resv_bitmap);
		if (rc == SLURM_SUCCESS) {
			/* If the following fails because of network
			 * connectivity, the origin cluster should ask
			 * when it comes back up if the cluster_lock
			 * cluster actually started the job */
			fed_mgr_job_start(job_ptr, job_ptr->start_time);
			if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
				info("Pack job %u+%u (%u) started",
				     job_ptr->pack_job_id,
				     job_ptr->pack_job_offset,
				     job_ptr->job_id);
			}
		} else {
			fed_mgr_job_unlock(job_ptr);
			error("Pack job %u+%u (%u) failed to start",
			      job_ptr->pack_job_id, job_ptr->pack_job_offset,
			      job_ptr->job_id);
			break;
		}
		if (job_ptr->time_min) {
			/* Set time limit as high as possible */
			acct_policy_alter_job(job_ptr, map->comp_time_limit);
			job_ptr->time_limit = map->comp_time_limit;
			reset_time = true;
		}
		if (job_ptr->start_time) {
			if (job_ptr->time_limit == INFINITE)
				hard_limit = YEAR_SECONDS;
			else
				hard_limit = job_ptr->time_limit * 60;
			job_ptr->end_time = job_ptr->start_time + hard_limit;
		/* Only reset if start_time is set. end_time must be
		 * set beforehand for _reset_job_time_limit(). */
			if (reset_time)
				_reset_job_time_limit(job_ptr, now, node_space);
		}
		if (reset_time)
			jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
	}
	list_iterator_destroy(iter);

	return rc;
}

/*
 * Deallocate all components if failed pack job start
 */
static void _pack_kill_now(pack_job_map_t *map)
{
	struct job_record *job_ptr;
	pack_job_rec_t *rec;
	ListIterator iter;
	time_t now = time(NULL);
	int cred_lifetime = 1200;
	uint32_t save_bitflags;

	(void) slurm_cred_ctx_get(slurmctld_config.cred_ctx,
				  SLURM_CRED_OPT_EXPIRY_WINDOW,
				  &cred_lifetime);
	iter = list_iterator_create(map->pack_job_list);
	while ((rec = (pack_job_rec_t *) list_next(iter))) {
		job_ptr = rec->job_ptr;
		if (IS_JOB_PENDING(job_ptr))
			continue;
		info("Deallocate job %u+%u (%u) due to pack job start failure",
		     job_ptr->pack_job_id, job_ptr->pack_job_offset,
		     job_ptr->job_id);
		job_ptr->details->begin_time = now + cred_lifetime + 1;
		job_ptr->end_time   = now;
		job_ptr->job_state  = JOB_PENDING | JOB_COMPLETING;
		last_job_update     = now;
		build_cg_bitmap(job_ptr);
		job_completion_logger(job_ptr, false);
		deallocate_nodes(job_ptr, false, false, false);
		/* Since job_completion_logger() removes the job's submit
		 * count, we need to add it again, but don't stage-out the
		 * burst buffer */
		save_bitflags = job_ptr->bit_flags;
		job_ptr->bit_flags |= JOB_KILL_HURRY;
		acct_policy_add_job_submit(job_ptr);
		job_ptr->bit_flags = save_bitflags;
		if (!job_ptr->node_bitmap_cg ||
		    (bit_set_count(job_ptr->node_bitmap_cg) == 0))
			batch_requeue_fini(job_ptr);
	}
	list_iterator_destroy(iter);
}

/*
 * If all components of a pack job can start now, then do so
 */
static void _pack_start_test(node_space_map_t *node_space)
{
	ListIterator iter;
	pack_job_map_t *map;
	time_t now = time(NULL);
	int rc;

	iter = list_iterator_create(pack_job_list);
	while ((map = (pack_job_map_t *) list_next (iter))) {
		if (!_pack_job_full(map)) {
			if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
				info("Pack job %u has indefinite start time",
				     map->pack_job_id);
			}
			map->prev_start = now + YEAR_SECONDS;
			continue;
		}

		map->prev_start = _pack_start_compute(map, 0);
		if (map->prev_start > now) {
			if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
				info("Pack job %u should be able to start in %u seconds",
				     map->pack_job_id,
				     (uint32_t) (map->prev_start - now));
			}
			continue;
		}

		if (!_pack_job_limit_check(map, now)) {
			if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
				info("Pack job %u prevented from starting by account/QOS limit",
				     map->pack_job_id);
			}
			map->prev_start = now + YEAR_SECONDS;
			continue;
		}

		if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
			info("Attempting to start pack job %u",
			     map->pack_job_id);
		}
		rc = _pack_start_now(map, node_space);
		if (rc != SLURM_SUCCESS) {
			if (debug_flags & DEBUG_FLAG_HETERO_JOBS) {
				info("Failed to start pack job %u",
				     map->pack_job_id);
			}
			_pack_kill_now(map);
		}
	}
	list_iterator_destroy(iter);
}