Skip to content
Snippets Groups Projects
node_scheduler.c 52.4 KiB
Newer Older
		return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;

	/* build sets of usable nodes based upon their configuration */
	error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size);
	if (error_code)
		return error_code;

	/* ensure that the job's required nodes are in these node sets */
	if (job_ptr->details->req_node_bitmap) {
		error_code = _nodes_in_sets(job_ptr->details->req_node_bitmap, 
					    node_set_ptr, node_set_size);
		if (error_code) {
			info("No nodes satisfy requirements for JobId=%u",
	/* enforce both user's and partition's node limits */
	/* info("req: %u-%u, %u", job_ptr->details->min_nodes,
	   job_ptr->details->max_nodes, part_ptr->max_nodes); */
	if (super_user) {
		min_nodes = job_ptr->details->min_nodes;
	} else {
		min_nodes = MAX(job_ptr->details->min_nodes, 
				part_ptr->min_nodes);
	}
	if (job_ptr->details->max_nodes == 0) {
		if (super_user)
			max_nodes = INFINITE;
		else
			max_nodes = part_ptr->max_nodes;
	} else if (super_user)
		max_nodes = job_ptr->details->max_nodes;
	else
		max_nodes = MIN(job_ptr->details->max_nodes, 
				part_ptr->max_nodes);
	max_nodes = MIN(max_nodes, 500000);	/* prevent overflows */
	if (job_ptr->details->max_nodes)
		req_nodes = max_nodes;
	else
		req_nodes = min_nodes;
	/* info("nodes:%u:%u:%u", min_nodes, req_nodes, max_nodes); */
	if (max_nodes < min_nodes) {
		error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
		error_code = _pick_best_nodes(node_set_ptr, node_set_size,
					      &select_bitmap, job_ptr,
					      part_ptr, min_nodes, max_nodes,
					      req_nodes);
		job_ptr->state_reason = WAIT_RESOURCES;
		if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
			/* Required nodes are down or 
			 * too many nodes requested */
			debug3("JobId=%u not runnable with present config",
			if (job_ptr->priority != 0)  /* Move to end of queue */
				job_ptr->priority = 1;
			last_job_update = time(NULL);
		} else if (error_code == ESLURM_NODES_BUSY)
			slurm_sched_job_is_pending();
	if (test_only) {	/* set if job not highest priority */
		slurm_sched_job_is_pending();
Moe Jette's avatar
Moe Jette committed
		error_code = SLURM_SUCCESS;
		goto cleanup;
	/* This job may be getting requeued; clear vestigial 
	 * state information before overwriting it and leaking 
	 * memory. */
	FREE_NULL_BITMAP(job_ptr->node_bitmap);
	xfree(job_ptr->nodes);

	job_ptr->node_bitmap = select_bitmap;
	if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) {
		/* Leave job queued, something is hosed */
		error("select_g_job_begin(%u): %m", job_ptr->job_id);
		error_code = ESLURM_NODES_BUSY;
		goto cleanup;
	}
	/* assign the nodes and stage_in the job */
	job_ptr->state_reason = WAIT_NO_REASON;
	job_ptr->nodes = bitmap2node_name(select_bitmap);
	select_bitmap = NULL;	/* nothing left to free */
	build_node_details(job_ptr);
	job_ptr->start_time = job_ptr->time_last_active = time(NULL);
	if (job_ptr->time_limit == NO_VAL)
		job_ptr->time_limit = part_ptr->max_time;
	if (job_ptr->time_limit == INFINITE)
		job_ptr->end_time = job_ptr->start_time + 
				    (365 * 24 * 60 * 60); /* secs in year */
		job_ptr->end_time = job_ptr->start_time + 
				    (job_ptr->time_limit * 60);   /* secs */
	if (job_ptr->mail_type & MAIL_JOB_BEGIN)
		mail_job_info(job_ptr, MAIL_JOB_BEGIN);
	if (select_node_bitmap)
		*select_node_bitmap = select_bitmap;
	else
		FREE_NULL_BITMAP(select_bitmap);
		for (i = 0; i < node_set_size; i++)
			FREE_NULL_BITMAP(node_set_ptr[i].my_bitmap);
/*
 * _build_node_list - identify which nodes could be allocated to a job
 * IN job_ptr - pointer to job to be scheduled
 * OUT node_set_pptr - list of node sets which could be used for the job
 * OUT node_set_size - number of node_set entries
 * RET error code 
 *
 * One node_set record is built per configuration record whose features
 * satisfy the job (see _valid_features) and which still has at least one
 * node left after masking with the partition's node bitmap. On success the
 * xmalloc'ed array is handed back through node_set_pptr and SLURM_SUCCESS
 * is returned; if no record qualifies the array is freed here and
 * ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE is returned.
 *
 * NOTE(review): this copy of the function looks incomplete. node_set_inx is
 * read before any visible initialization, config_iterator has no visible
 * declaration, the "if (detail_ptr->exc_node_bitmap) {" block has no visible
 * closing brace, and the first resource test in the loop has no visible
 * body of its own. Confirm against the canonical file before relying on it.
 */
static int _build_node_list(struct job_record *job_ptr, 
			    struct node_set **node_set_pptr,
			    int *node_set_size)
{
	int node_set_inx;
	struct node_set *node_set_ptr;
	struct config_record *config_ptr;
	struct part_record *part_ptr = job_ptr->part_ptr;
	int tmp_feature, check_node_config, config_filter = 0;
	struct job_details *detail_ptr = job_ptr->details;
	bitstr_t *exc_node_mask = NULL;
	multi_core_data_t *mc_ptr = detail_ptr->mc_ptr;
	/* Allocate two records up front; the array grows by one record each
	 * time a set is accepted (see the xrealloc at the end of the loop) */
	node_set_ptr = (struct node_set *) 
			xmalloc(sizeof(struct node_set) * 2);
	node_set_ptr[node_set_inx+1].my_bitmap = NULL;
	/* Work on a private copy of the job's exc_node_bitmap, if it has one */
	if (detail_ptr->exc_node_bitmap) {
		exc_node_mask = bit_copy(detail_ptr->exc_node_bitmap);
		if (exc_node_mask == NULL)
			fatal("bit_copy malloc failure");
	config_iterator = list_iterator_create(config_list);
	if (config_iterator == NULL)
		fatal("list_iterator_create malloc failure");

	/* Examine every configuration record in config_list */
	while ((config_ptr = (struct config_record *) 
			list_next(config_iterator))) {
		/* A zero result means the config's features do not satisfy
		 * the job; the non-zero value is kept in the node set */
		tmp_feature = _valid_features(job_ptr->details->features,
					      config_ptr->feature);
		if (tmp_feature == 0)
			continue;

		/* config_filter flags a config record that is too small for
		 * the job, either in procs/memory/tmp_disk or, when the job
		 * carries multi-core data, in sockets/cores/threads.
		 * NOTE(review): as shown, the multi-core "if" is the body of
		 * the first test; a "config_filter = 1;" line presumably
		 * belongs between them - confirm */
		config_filter = 0;
		if ((detail_ptr->job_min_procs    > config_ptr->cpus       )
		||  (detail_ptr->job_min_memory   > config_ptr->real_memory) 
		||  (detail_ptr->job_min_tmp_disk > config_ptr->tmp_disk))
		if (mc_ptr
		&&  ((mc_ptr->min_sockets      > config_ptr->sockets    )
		||   (mc_ptr->min_cores        > config_ptr->cores      )
		||   (mc_ptr->min_threads      > config_ptr->threads    )
		||   (mc_ptr->job_min_sockets  > config_ptr->sockets    )
		||   (mc_ptr->job_min_cores    > config_ptr->cores      )
		||   (mc_ptr->job_min_threads  > config_ptr->threads    )))
			config_filter = 1;
		
		/* since nodes can register with more resources than defined */
		/* in the configuration, we want to use those higher values */
		/* for scheduling, but only as needed (slower) */
		/* NOTE(review): as shown, fast_schedule skips filtered
		 * configs and then requests per-node checking, while the
		 * non-fast path requests none; that seems at odds with the
		 * comment above, so lines may be missing here - confirm */
		if (slurmctld_conf.fast_schedule) {
			if (config_filter)
				continue;
			check_node_config = 1;
		} else
			check_node_config = 0;

		/* Candidate nodes: those of this config record that are also
		 * in the job's partition */
		node_set_ptr[node_set_inx].my_bitmap =
		    bit_copy(config_ptr->node_bitmap);
		if (node_set_ptr[node_set_inx].my_bitmap == NULL)
			fatal("bit_copy malloc failure");
		bit_and(node_set_ptr[node_set_inx].my_bitmap,
			part_ptr->node_bitmap);
		/* NOTE(review): exc_node_mask is a plain copy of
		 * exc_node_bitmap as far as this view shows; if that bitmap
		 * marks excluded nodes the mask presumably gets inverted
		 * somewhere not visible here - confirm */
		if (exc_node_mask)
			bit_and(node_set_ptr[node_set_inx].my_bitmap,
				exc_node_mask);
		node_set_ptr[node_set_inx].nodes =
			bit_set_count(node_set_ptr[node_set_inx].my_bitmap);
		/* Optionally weed out individual nodes that are too small */
		if (check_node_config && 
		    (node_set_ptr[node_set_inx].nodes != 0))
			_filter_nodes_in_set(&node_set_ptr[node_set_inx], 
					     detail_ptr);

		/* Nothing usable left: release the bitmap and reuse this
		 * slot for the next config record */
		if (node_set_ptr[node_set_inx].nodes == 0) {
			FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
			continue;
		}
		/* Record the config's attributes alongside the bitmap */
		node_set_ptr[node_set_inx].cpus_per_node =
			config_ptr->cpus;
		node_set_ptr[node_set_inx].real_memory =
			config_ptr->real_memory;		
		node_set_ptr[node_set_inx].weight =
		    config_ptr->weight;
		node_set_ptr[node_set_inx].feature = tmp_feature;
		debug2("found %d usable nodes from config containing %s",
		       node_set_ptr[node_set_inx].nodes, config_ptr->nodes);

		/* Accept this set and grow the array, keeping a spare record
		 * whose my_bitmap is NULL beyond the one filled next */
		node_set_inx++;
		xrealloc(node_set_ptr,
			 sizeof(struct node_set) * (node_set_inx + 2));
		node_set_ptr[node_set_inx + 1].my_bitmap = NULL;
	}
	list_iterator_destroy(config_iterator);
	/* eliminate last (incomplete) node_set record */
	FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
	FREE_NULL_BITMAP(exc_node_mask);

	/* No config record produced a usable set */
	if (node_set_inx == 0) {
		info("No nodes satisfy job %u requirements", 
		     job_ptr->job_id);
		xfree(node_set_ptr);
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	}

	/* Hand the array and its entry count back to the caller */
	*node_set_size = node_set_inx;
	*node_set_pptr = node_set_ptr;
	return SLURM_SUCCESS;
}

/*
 * _filter_nodes_in_set - Remove from the node set any nodes which lack
 *	sufficient resources to satisfy the job's request
 * IN/OUT node_set_ptr - node set to trim; bits are cleared in my_bitmap and
 *	the nodes count is decremented for each node removed
 * IN job_con - job details supplying job_min_procs, job_min_memory,
 *	job_min_tmp_disk and the optional multi-core limits (mc_ptr)
 *
 * With slurmctld_conf.fast_schedule set the comparison uses the values in
 * each node's configuration record; otherwise it uses the values stored in
 * the node record itself. Scanning stops once the set's node count hits zero.
 *
 * NOTE(review): this copy looks incomplete. The function's opening brace,
 * the declaration of i, the "if (mc_ptr" line that should precede each
 * multi-core test, and the statements controlled by several "if"s
 * (presumably "continue;") are not visible, nor are the closing braces of
 * the first loop and of the function. Confirm against the canonical file.
 */
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
				 struct job_details *job_con)
	multi_core_data_t *mc_ptr = job_con->mc_ptr;
	if (slurmctld_conf.fast_schedule) {	/* test config records */
		struct config_record *node_con = NULL;
		for (i = 0; i < node_record_count; i++) {
			int job_ok = 0, job_mc_ptr_ok = 0;
			/* only nodes currently in the set are of interest */
			if (bit_test(node_set_ptr->my_bitmap, i) == 0)
			node_con = node_record_table_ptr[i].config_ptr;
			/* basic requirements: procs, memory, tmp disk */
			if ((job_con->job_min_procs    <= node_con->cpus)
			&&  (job_con->job_min_memory   <= node_con->real_memory)
			&&  (job_con->job_min_tmp_disk <= node_con->tmp_disk))
				job_ok = 1;
			/* multi-core requirements: sockets, cores, threads */
			&&  ((mc_ptr->min_sockets      <= node_con->sockets)
			&&   (mc_ptr->min_cores        <= node_con->cores  )
			&&   (mc_ptr->min_threads      <= node_con->threads)
			&&   (mc_ptr->job_min_sockets  <= node_con->sockets)
			&&   (mc_ptr->job_min_cores    <= node_con->cores  )
			&&   (mc_ptr->job_min_threads  <= node_con->threads)))
				job_mc_ptr_ok = 1;
			/* multi-core result only matters if the job has
			 * multi-core data */
			if (job_ok && (!mc_ptr || job_mc_ptr_ok))

			/* drop the node from the set; stop when none remain */
			bit_clear(node_set_ptr->my_bitmap, i);
			if ((--(node_set_ptr->nodes)) == 0)
				break;
	} else {	/* fast_schedule == 0, test individual node records */
		struct node_record   *node_ptr = NULL;
		for (i = 0; i < node_record_count; i++) {
			int job_ok = 0, job_mc_ptr_ok = 0;
			/* only nodes currently in the set are of interest */
			if (bit_test(node_set_ptr->my_bitmap, i) == 0)
				continue;
			node_ptr = &node_record_table_ptr[i];
			/* basic requirements: procs, memory, tmp disk */
			if ((job_con->job_min_procs    <= node_ptr->cpus)
			&&  (job_con->job_min_memory   <= node_ptr->real_memory)
			&&  (job_con->job_min_tmp_disk <= node_ptr->tmp_disk))
				job_ok = 1;
			/* multi-core requirements: sockets, cores, threads */
			&&  ((mc_ptr->min_sockets      <= node_ptr->sockets)
			&&   (mc_ptr->min_cores        <= node_ptr->cores  )
			&&   (mc_ptr->min_threads      <= node_ptr->threads)
			&&   (mc_ptr->job_min_sockets  <= node_ptr->sockets)
			&&   (mc_ptr->job_min_cores    <= node_ptr->cores  )
			&&   (mc_ptr->job_min_threads  <= node_ptr->threads)))
				job_mc_ptr_ok = 1;
			if (job_ok && (!mc_ptr || job_mc_ptr_ok))
			/* drop the node from the set; stop when none remain */
			bit_clear(node_set_ptr->my_bitmap, i);
			if ((--(node_set_ptr->nodes)) == 0)
				break;
		}
/*
 * _nodes_in_sets - Determine if required nodes are included in node_set(s)
 * IN req_bitmap - nodes specifically required by the job 
 * IN node_set_ptr - sets of valid nodes
 * IN node_set_size - count of node_set entries
 * RET 0 if in set, otherwise an error code
 */
static int _nodes_in_sets(bitstr_t *req_bitmap, 
			  struct node_set * node_set_ptr, 
			  int node_set_size)
{
	/* Union of the my_bitmap of every node set; stays NULL when
	 * node_set_size is not positive */
	bitstr_t *union_bitmap = NULL;
	int set_inx, rc;

	for (set_inx = 0; set_inx < node_set_size; set_inx++) {
		if (union_bitmap == NULL) {
			/* first set seeds the union with a private copy */
			union_bitmap =
			    bit_copy(node_set_ptr[set_inx].my_bitmap);
			if (union_bitmap == NULL)
				fatal("bit_copy malloc failure");
			continue;
		}
		bit_or(union_bitmap, node_set_ptr[set_inx].my_bitmap);
	}

	/* Success only if there is a union and bit_super_set() reports
	 * that req_bitmap is covered by it */
	if (union_bitmap
	    && (bit_super_set(req_bitmap, union_bitmap) == 1))
		rc = SLURM_SUCCESS;
	else
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	FREE_NULL_BITMAP(union_bitmap);
	return rc;
}
/*
 * build_node_details - set cpu counts and addresses for allocated nodes:
 *	cpu_count_reps, cpus_per_node, node_addr, node_cnt, num_cpu_groups
 * IN job_ptr - pointer to a job record
 */
/* Populate the per-node fields of job_ptr from job_ptr->nodes.
 * If the job has no node_bitmap or no nodes string, the counts are zeroed,
 * cpus_per_node and node_addr are set to NULL, alloc_lps is freed, and the
 * function returns. Otherwise the node names are walked in hostlist order;
 * each node's address is copied into node_addr[] and its usable processor
 * count is obtained with select_g_get_extra_jobinfo(SELECT_AVAIL_CPUS).
 * cpus_per_node[] / cpu_count_reps[] are filled from those counts, and
 * num_cpu_groups is set to cpu_inx + 1 at the end.
 *
 * NOTE(review): this copy looks incomplete. The opening brace, the
 * declarations of node_inx, cpu_inx and cr_count, the test on node_ptr
 * after find_node_record(), several closing braces and parts of the
 * cpus_per_node assignment are not visible, and lines such as
 * "... avatar" / "... committed" are page residue rather than code.
 * Confirm against the canonical file before relying on it. */
extern void build_node_details(struct job_record *job_ptr)
	hostlist_t host_list = NULL;
	struct node_record *node_ptr;
	char *this_node_name;
        int error_code = SLURM_SUCCESS, cr_enabled = 0;
	if ((job_ptr->node_bitmap == NULL) || (job_ptr->nodes == NULL)) {
		/* No nodes allocated, we're done... */
		job_ptr->num_cpu_groups = 0;
		job_ptr->node_cnt = 0;
		job_ptr->cpus_per_node = NULL;
		job_ptr->node_addr = NULL;
		job_ptr->alloc_lps_cnt = 0;
		xfree(job_ptr->alloc_lps);
Moe Jette's avatar
Moe Jette committed
		return;
Danny Auble's avatar
Danny Auble committed
	
	/* Use hostlist here to ensure ordering of info matches that of srun */
	if ((host_list = hostlist_create(job_ptr->nodes)) == NULL)
		fatal("hostlist_create error for %s: %m", job_ptr->nodes);

Danny Auble's avatar
Danny Auble committed
	job_ptr->node_cnt = hostlist_count(host_list);	

	/* Size every per-node array for node_cnt entries */
	xrealloc(job_ptr->cpus_per_node, 
		(sizeof(uint32_t) * job_ptr->node_cnt));
	xrealloc(job_ptr->cpu_count_reps, 
		(sizeof(uint32_t) * job_ptr->node_cnt));
	xrealloc(job_ptr->node_addr, 
		(sizeof(slurm_addr) * job_ptr->node_cnt));	
	/* Discard any previous alloc_lps array; a fresh one with one int
	 * per node is allocated only when job_ptr->cr_enabled is set */
        job_ptr->alloc_lps_cnt = 0;
        xfree(job_ptr->alloc_lps);
        if (job_ptr->cr_enabled) {
                cr_enabled = job_ptr->cr_enabled;
                job_ptr->alloc_lps = xmalloc(job_ptr->node_cnt * sizeof(int));
                job_ptr->alloc_lps_cnt = job_ptr->node_cnt;
	/* Take node names off the hostlist one at a time */
	while ((this_node_name = hostlist_shift(host_list))) {
		node_ptr = find_node_record(this_node_name);
			int usable_lps = 0;
#ifdef HAVE_BG
			/* HAVE_BG, single-node job: one cpu group holding
			 * job_ptr->num_procs, then skip the rest of the body */
			if(job_ptr->node_cnt == 1) {
				memcpy(&job_ptr->node_addr[node_inx++],
				       &node_ptr->slurm_addr, 
				       sizeof(slurm_addr));
				cpu_inx++;
				
				job_ptr->cpus_per_node[cpu_inx] =
					job_ptr->num_procs;
				job_ptr->cpu_count_reps[cpu_inx] = 1;
				goto cleanup;
			/* Ask the select plugin for this node's usable
			 * processor count */
			error_code = select_g_get_extra_jobinfo( 
				node_ptr, job_ptr, SELECT_AVAIL_CPUS, 
				&usable_lps);
			if (error_code == SLURM_SUCCESS) {
				if (cr_enabled && job_ptr->alloc_lps) {
					job_ptr->alloc_lps[cr_count++] =
								usable_lps;
				}
			} else {
					xfree(job_ptr->alloc_lps); 
					job_ptr->alloc_lps = NULL;
					job_ptr->alloc_lps_cnt = 0;
				}
				error("Unable to get extra jobinfo "
				      "from JobId=%u", job_ptr->job_id);
			memcpy(&job_ptr->node_addr[node_inx++],
			       &node_ptr->slurm_addr, sizeof(slurm_addr));
			/* A new cpu group is needed for the first node or
			 * when usable_lps differs from the current group's
			 * value; otherwise the current group's repetition
			 * count is bumped.
			 * NOTE(review): the new-group branch is garbled in
			 * this copy - confirm */
			if ((cpu_inx == -1) ||
			    (job_ptr->cpus_per_node[cpu_inx] !=
			     usable_lps)) {
				job_ptr->cpus_per_node[cpu_inx] =
				job_ptr->cpu_count_reps[cpu_inx] = 1;
			} else
				job_ptr->cpu_count_reps[cpu_inx]++;
			error("Invalid node %s in JobId=%u",
			      this_node_name, job_ptr->job_id);
#ifdef HAVE_BG
 cleanup:	
#endif
		/* the name returned by hostlist_shift() is released with
		 * free(), not xfree() */
		free(this_node_name);
	hostlist_destroy(host_list);
	/* Every name in the hostlist should have produced one address */
	if (job_ptr->node_cnt != node_inx) {
		error("Node count mismatch for JobId=%u (%u,%u)",
		      job_ptr->job_id, job_ptr->node_cnt, node_inx);
	job_ptr->num_cpu_groups = cpu_inx + 1;
	if ((cr_enabled) && (error_code == SLURM_SUCCESS)) {
                /* Update cr node structure with this job's allocated resources */
                error_code = select_g_update_nodeinfo(job_ptr);
                if(error_code != SLURM_SUCCESS)
                      fatal("Unable to update nodeinfo JobId=%u",
Danny Auble's avatar
Danny Auble committed
                            job_ptr->job_id);
/*
 * _valid_features - determine if the requested features are satisfied by
 *	those available
 * IN requested - requested features (by a job)
 * IN available - available features (on a node)
 * RET 0 if request is not satisfied, otherwise an integer indicating which 
 *	mutually exclusive feature is satisfied. For example,
 *	_valid_features("[fs1|fs2|fs3|fs4]", "fs3") returns 3. See the 
 *	SLURM administrator and user guides for details. Returns 1 if 
 *	requirements are satisfied without mutually exclusive feature list.
 */
/* Evaluate a feature request expression against a node's feature list.
 * The request is scanned one character at a time: '&' and '|' end a
 * feature name and select AND / OR for combining the next match, '['
 * opens a bracketed group of alternatives and ']' closes it. Each feature
 * name is tested with _match_feature(). A NULL request returns 1 (no
 * constraints) and a NULL feature list returns 0.
 *
 * NOTE(review): this copy looks incomplete and is cut off before the end
 * of the function. The opening brace, several closing braces, the "or"
 * branch of the end-of-string case and the final return are not visible.
 * Confirm against the canonical file before relying on it. */
static int _valid_features(char *requested, char *available)
	char *tmp_requested, *str_ptr1;
	int bracket, found, i, option, position, result;
	int last_op;		/* last operation 0 for or, 1 for and */
	int save_op = 0, save_result = 0;	/* for bracket support */

	if (requested == NULL)
		return 1;	/* no constraints */
	if (available == NULL)
		return 0;	/* no features */

	/* Parse a private copy: separators are overwritten with a
	 * terminator below so each feature name becomes its own string */
	tmp_requested = xstrdup(requested);
	bracket = option = position = 0;
	str_ptr1 = tmp_requested;	/* start of feature name */
	result = last_op = 1;	/* assume good for now */
	for (i = 0;; i++) {
		/* end of the request string: evaluate the last feature name */
		if (tmp_requested[i] == (char) NULL) {
			if (strlen(str_ptr1) == 0)
			found = _match_feature(str_ptr1, available);
			if (last_op == 1)	/* and */
				result &= found;

		/* '&': fold the name just ended into result, next op is AND */
		if (tmp_requested[i] == '&') {
			/* '&' is reported as a parse failure inside brackets */
			if (bracket != 0) {
				debug("_valid_features: parsing failure on %s",
					requested);
			tmp_requested[i] = (char) NULL;
			found = _match_feature(str_ptr1, available);
			if (last_op == 1)	/* and */
				result &= found;
			else	/* or */
				result |= found;
			str_ptr1 = &tmp_requested[i + 1];
			last_op = 1;	/* and */

		/* '|': fold the name just ended into result, next op is OR */
		} else if (tmp_requested[i] == '|') {
			tmp_requested[i] = (char) NULL;
			found = _match_feature(str_ptr1, available);
			/* inside brackets, remember which alternative
			 * (counted from 1) matched */
			if (bracket != 0) {
				if (found)
					option = position;
				position++;
			}
			if (last_op == 1)	/* and */
				result &= found;
			else	/* or */
				result |= found;
			str_ptr1 = &tmp_requested[i + 1];
			last_op = 0;	/* or */

		/* '[': save the outer operator and result, then start the
		 * group afresh with its first alternative at position 1 */
		} else if (tmp_requested[i] == '[') {
			bracket++;
			position = 1;
			save_op = last_op;
			save_result = result;
			last_op = result = 1;
			str_ptr1 = &tmp_requested[i + 1];

		/* ']': evaluate the group's last alternative, then merge the
		 * group's result with the saved outer result */
		} else if (tmp_requested[i] == ']') {
			tmp_requested[i] = (char) NULL;
			found = _match_feature(str_ptr1, available);
			if (found)
				option = position;
			result |= found;
			if (save_op == 1)	/* and */
				result &= save_result;
			else	/* or */
				result |= save_result;
			/* the character after ']' picks the operator for
			 * what follows; the next name starts after it */
			if ((tmp_requested[i + 1] == '&')
			    && (bracket == 1)) {
				last_op = 1;
				str_ptr1 = &tmp_requested[i + 2];
			} else if ((tmp_requested[i + 1] == '|')
				   && (bracket == 1)) {
				last_op = 0;
				str_ptr1 = &tmp_requested[i + 2];
			} else if ((tmp_requested[i + 1] == (char) NULL)
				   && (bracket == 1)) {
				/* NOTE(review): lines appear to be missing
				 * between this test and the message - confirm */
				debug("_valid_features: parsing failure on %s",
					requested);

/*
 * re_kill_job - for a given job, deallocate its nodes for a second time, 
 *	basically a cleanup for failed deallocate() calls
 * IN job_ptr - pointer to terminating job (already in some COMPLETING state)
 * globals: node_record_count - number of nodes in the system
 *	node_record_table_ptr - pointer to global node table
 */
extern void re_kill_job(struct job_record *job_ptr)
{
	kill_job_msg_t *kill_job;
	agent_arg_t *agent_args;
	hostlist_t kill_hostlist = hostlist_create("");
	char host_str[64];

	xassert(job_ptr);
	xassert(job_ptr->details);

	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_TERMINATE_JOB;
	agent_args->hostlist = hostlist_create("");
	agent_args->retry = 0;
	kill_job = xmalloc(sizeof(kill_job_msg_t));
	kill_job->job_id  = job_ptr->job_id;
	kill_job->job_uid = job_ptr->user_id;
	kill_job->time    = time(NULL);
	kill_job->select_jobinfo = select_g_copy_jobinfo(
			job_ptr->select_jobinfo);

	for (i = 0; i < node_record_count; i++) {
		struct node_record *node_ptr = &node_record_table_ptr[i];
		if ((job_ptr->node_bitmap == NULL) ||
		    (bit_test(job_ptr->node_bitmap, i) == 0))
		if ((node_ptr->node_state & NODE_STATE_BASE) 
				== NODE_STATE_DOWN) {
			/* Consider job already completed */
			bit_clear(job_ptr->node_bitmap, i);
			if (node_ptr->comp_job_cnt)
				(node_ptr->comp_job_cnt)--;
			if ((--job_ptr->node_cnt) == 0) {
				job_ptr->job_state &= (~JOB_COMPLETING);
		if (node_ptr->node_state & NODE_STATE_NO_RESPOND)
			continue;
		(void) hostlist_push_host(kill_hostlist, node_ptr->name);
#ifdef HAVE_FRONT_END		/* Operate only on front-end */
		hostlist_push(agent_args->hostlist, node_ptr->name);
		agent_args->node_count++;
	}

	if (agent_args->node_count == 0) {
		xfree(kill_job);
		xfree(agent_args);
		hostlist_destroy(kill_hostlist);
	hostlist_uniq(kill_hostlist);
	hostlist_ranged_string(kill_hostlist, 
			sizeof(host_str), host_str);
	if (job_ptr->job_id != last_job_id) {
		info("Resending TERMINATE_JOB request JobId=%u BPlist=%s",
			job_ptr->job_id, host_str);
	} else {
		debug("Resending TERMINATE_JOB request JobId=%u BPlist=%s",
			job_ptr->job_id, host_str);
	}
	if (job_ptr->job_id != last_job_id) {
		info("Resending TERMINATE_JOB request JobId=%u Nodelist=%s",
	} else {
		debug("Resending TERMINATE_JOB request JobId=%u Nodelist=%s",
			job_ptr->job_id, host_str);
	}
	hostlist_destroy(kill_hostlist);