Newer
Older
return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
/* build sets of usable nodes based upon their configuration */
error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size);
if (error_code)
return error_code;
/* insure that selected nodes in these node sets */
if (job_ptr->details->req_node_bitmap) {
error_code = _nodes_in_sets(job_ptr->details->req_node_bitmap,
node_set_ptr, node_set_size);
if (error_code) {
info("No nodes satisfy requirements for JobId=%u",
job_ptr->job_id);
goto cleanup;
/* enforce both user's and partition's node limits */
/* info("req: %u-%u, %u", job_ptr->details->min_nodes,
job_ptr->details->max_nodes, part_ptr->max_nodes); */
if (super_user) {
min_nodes = job_ptr->details->min_nodes;
} else {
min_nodes = MAX(job_ptr->details->min_nodes,
part_ptr->min_nodes);
}
if (job_ptr->details->max_nodes == 0) {
if (super_user)
max_nodes = INFINITE;
else
max_nodes = part_ptr->max_nodes;
} else if (super_user)
max_nodes = job_ptr->details->max_nodes;
else
max_nodes = MIN(job_ptr->details->max_nodes,
part_ptr->max_nodes);
max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
req_nodes = max_nodes;
else
req_nodes = min_nodes;
/* info("nodes:%u:%u:%u", min_nodes, req_nodes, max_nodes); */
error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
error_code = _pick_best_nodes(node_set_ptr, node_set_size,
&select_bitmap, job_ptr,
part_ptr, min_nodes, max_nodes,
req_nodes);
if (error_code) {
job_ptr->state_reason = WAIT_RESOURCES;
if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
/* Required nodes are down or
* too many nodes requested */
debug3("JobId=%u not runnable with present config",
job_ptr->job_id);
if (job_ptr->priority != 0) /* Move to end of queue */
job_ptr->priority = 1;
last_job_update = time(NULL);
} else if (error_code == ESLURM_NODES_BUSY)
slurm_sched_job_is_pending();
goto cleanup;
if (test_only) { /* set if job not highest priority */
slurm_sched_job_is_pending();
/* This job may be getting requeued, clear vestigial
* state information before over-writting and leaking
* memory. */
FREE_NULL_BITMAP(job_ptr->node_bitmap);
xfree(job_ptr->nodes);
job_ptr->node_bitmap = select_bitmap;
if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) {
/* Leave job queued, something is hosed */
error("select_g_job_begin(%u): %m", job_ptr->job_id);
error_code = ESLURM_NODES_BUSY;
goto cleanup;
}
/* assign the nodes and stage_in the job */
job_ptr->state_reason = WAIT_NO_REASON;
job_ptr->nodes = bitmap2node_name(select_bitmap);
select_bitmap = NULL; /* nothing left to free */
allocate_nodes(job_ptr);
build_node_details(job_ptr);
job_ptr->job_state = JOB_RUNNING;
job_ptr->start_time = job_ptr->time_last_active = time(NULL);
if (job_ptr->time_limit == NO_VAL)
job_ptr->time_limit = part_ptr->max_time;
if (job_ptr->time_limit == INFINITE)
job_ptr->end_time = job_ptr->start_time +
(365 * 24 * 60 * 60); /* secs in year */
job_ptr->end_time = job_ptr->start_time +
(job_ptr->time_limit * 60); /* secs */
if (job_ptr->mail_type & MAIL_JOB_BEGIN)
mail_job_info(job_ptr, MAIL_JOB_BEGIN);
cleanup:
if (select_node_bitmap)
*select_node_bitmap = select_bitmap;
else
FREE_NULL_BITMAP(select_bitmap);
if (node_set_ptr) {
for (i = 0; i < node_set_size; i++)
FREE_NULL_BITMAP(node_set_ptr[i].my_bitmap);
xfree(node_set_ptr);
}
return error_code;
}
/*
 * _build_node_list - identify which nodes could be allocated to a job
 * IN job_ptr - pointer to job to be scheduled
 * OUT node_set_pptr - list of node sets which could be used for the job
 * OUT node_set_size - number of node_set entries
 * RET error code
 */
/*
 * Walk every configuration record and build one node_set per configuration
 * that can contribute nodes to the job.  A configuration contributes when
 * its features satisfy the job and, after intersecting with the partition's
 * nodes and removing the job's excluded nodes, at least one node remains.
 *
 * On SLURM_SUCCESS the caller owns *node_set_pptr and every my_bitmap in it.
 * On failure nothing is returned through the OUT arguments and all memory
 * allocated here is released.
 */
static int _build_node_list(struct job_record *job_ptr,
			    struct node_set **node_set_pptr,
			    int *node_set_size)
{
	int node_set_inx;
	struct node_set *node_set_ptr;
	struct config_record *config_ptr;
	struct part_record *part_ptr = job_ptr->part_ptr;
	ListIterator config_iterator;
	int tmp_feature, check_node_config, config_filter = 0;
	struct job_details *detail_ptr = job_ptr->details;
	bitstr_t *exc_node_mask = NULL;
	multi_core_data_t *mc_ptr = detail_ptr->mc_ptr;

	/* The array always holds one spare record beyond the one being
	 * filled in; it grows by one each time a record is completed. */
	node_set_inx = 0;
	node_set_ptr = (struct node_set *)
			xmalloc(sizeof(struct node_set) * 2);
	node_set_ptr[node_set_inx+1].my_bitmap = NULL;

	/* Invert the excluded-node bitmap once so it can be AND-ed in below */
	if (detail_ptr->exc_node_bitmap) {
		exc_node_mask = bit_copy(detail_ptr->exc_node_bitmap);
		if (exc_node_mask == NULL)
			fatal("bit_copy malloc failure");
		bit_not(exc_node_mask);
	}

	config_iterator = list_iterator_create(config_list);
	if (config_iterator == NULL)
		fatal("list_iterator_create malloc failure");

	while ((config_ptr = (struct config_record *)
			list_next(config_iterator))) {
		tmp_feature = _valid_features(job_ptr->details->features,
					      config_ptr->feature);
		if (tmp_feature == 0)
			continue;

		/* config_filter: the configured values are too small */
		config_filter = 0;
		if ((detail_ptr->job_min_procs    > config_ptr->cpus       )
		||  (detail_ptr->job_min_memory   > config_ptr->real_memory)
		||  (detail_ptr->job_min_tmp_disk > config_ptr->tmp_disk))
			config_filter = 1;
		if (mc_ptr
		&&  ((mc_ptr->min_sockets     > config_ptr->sockets    )
		||   (mc_ptr->min_cores       > config_ptr->cores      )
		||   (mc_ptr->min_threads     > config_ptr->threads    )
		||   (mc_ptr->job_min_sockets > config_ptr->sockets    )
		||   (mc_ptr->job_min_cores   > config_ptr->cores      )
		||   (mc_ptr->job_min_threads > config_ptr->threads    )))
			config_filter = 1;

		/* since nodes can register with more resources than defined */
		/* in the configuration, we want to use those higher values */
		/* for scheduling, but only as needed (slower) */
		if (slurmctld_conf.fast_schedule) {
			if (config_filter)
				continue;
			check_node_config = 0;
		} else if (config_filter) {
			check_node_config = 1;
		} else
			check_node_config = 0;

		node_set_ptr[node_set_inx].my_bitmap =
			bit_copy(config_ptr->node_bitmap);
		if (node_set_ptr[node_set_inx].my_bitmap == NULL)
			fatal("bit_copy malloc failure");
		bit_and(node_set_ptr[node_set_inx].my_bitmap,
			part_ptr->node_bitmap);
		if (exc_node_mask)
			bit_and(node_set_ptr[node_set_inx].my_bitmap,
				exc_node_mask);
		node_set_ptr[node_set_inx].nodes =
			bit_set_count(node_set_ptr[node_set_inx].my_bitmap);
		if (check_node_config &&
		    (node_set_ptr[node_set_inx].nodes != 0))
			_filter_nodes_in_set(&node_set_ptr[node_set_inx],
					     detail_ptr);

		if (node_set_ptr[node_set_inx].nodes == 0) {
			/* nothing usable here; reuse this slot next time */
			FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
			continue;
		}

		/* cpus_per_node comes from the configured CPU count, not
		 * from the configured memory size */
		node_set_ptr[node_set_inx].cpus_per_node =
			config_ptr->cpus;
		node_set_ptr[node_set_inx].real_memory =
			config_ptr->real_memory;
		node_set_ptr[node_set_inx].weight =
			config_ptr->weight;
		node_set_ptr[node_set_inx].feature = tmp_feature;
		debug2("found %d usable nodes from config containing %s",
			node_set_ptr[node_set_inx].nodes, config_ptr->nodes);

		node_set_inx++;
		xrealloc(node_set_ptr,
			 sizeof(struct node_set) * (node_set_inx + 2));
		node_set_ptr[node_set_inx + 1].my_bitmap = NULL;
	}
	list_iterator_destroy(config_iterator);

	/* eliminate last (incomplete) node_set record */
	FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
	FREE_NULL_BITMAP(exc_node_mask);

	if (node_set_inx == 0) {
		info("No nodes satisfy job %u requirements",
		     job_ptr->job_id);
		/* the array is not handed to the caller on this path,
		 * so it must be released here */
		xfree(node_set_ptr);
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	}

	*node_set_size = node_set_inx;
	*node_set_pptr = node_set_ptr;
	return SLURM_SUCCESS;
}
/* Remove from the node set any nodes which lack sufficient resources
* to satisfy the job's request */
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
struct job_details *job_con)
{
int i;
multi_core_data_t *mc_ptr = job_con->mc_ptr;
if (slurmctld_conf.fast_schedule) { /* test config records */
struct config_record *node_con = NULL;
for (i = 0; i < node_record_count; i++) {
int job_ok = 0, job_mc_ptr_ok = 0;
if (bit_test(node_set_ptr->my_bitmap, i) == 0)
continue;
node_con = node_record_table_ptr[i].config_ptr;
if ((job_con->job_min_procs <= node_con->cpus)
&& (job_con->job_min_memory <= node_con->real_memory)
&& (job_con->job_min_tmp_disk <= node_con->tmp_disk))
job_ok = 1;
&& ((mc_ptr->min_sockets <= node_con->sockets)
&& (mc_ptr->min_cores <= node_con->cores )
&& (mc_ptr->min_threads <= node_con->threads)
&& (mc_ptr->job_min_sockets <= node_con->sockets)
&& (mc_ptr->job_min_cores <= node_con->cores )
&& (mc_ptr->job_min_threads <= node_con->threads)))
job_mc_ptr_ok = 1;
if (job_ok && (!mc_ptr || job_mc_ptr_ok))
continue;
bit_clear(node_set_ptr->my_bitmap, i);
if ((--(node_set_ptr->nodes)) == 0)
break;
}
} else { /* fast_schedule == 0, test individual node records */
struct node_record *node_ptr = NULL;
for (i = 0; i < node_record_count; i++) {
int job_ok = 0, job_mc_ptr_ok = 0;
if (bit_test(node_set_ptr->my_bitmap, i) == 0)
continue;
node_ptr = &node_record_table_ptr[i];
if ((job_con->job_min_procs <= node_ptr->cpus)
&& (job_con->job_min_memory <= node_ptr->real_memory)
&& (job_con->job_min_tmp_disk <= node_ptr->tmp_disk))
job_ok = 1;
&& ((mc_ptr->min_sockets <= node_ptr->sockets)
&& (mc_ptr->min_cores <= node_ptr->cores )
&& (mc_ptr->min_threads <= node_ptr->threads)
&& (mc_ptr->job_min_sockets <= node_ptr->sockets)
&& (mc_ptr->job_min_cores <= node_ptr->cores )
&& (mc_ptr->job_min_threads <= node_ptr->threads)))
job_mc_ptr_ok = 1;
if (job_ok && (!mc_ptr || job_mc_ptr_ok))
bit_clear(node_set_ptr->my_bitmap, i);
if ((--(node_set_ptr->nodes)) == 0)
break;
}
}
}
/*
 * _nodes_in_sets - Determine if required nodes are included in node_set(s)
 * IN req_bitmap - nodes specifically required by the job
 * IN node_set_ptr - sets of valid nodes
 * IN node_set_size - count of node_set entries
 * RET 0 if in set, otherwise an error code
 */
static int _nodes_in_sets(bitstr_t *req_bitmap,
			  struct node_set * node_set_ptr,
			  int node_set_size)
{
	/* Union of the bitmaps of every node set seen so far */
	bitstr_t *union_bitmap = NULL;
	int rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	int set_inx;

	for (set_inx = 0; set_inx < node_set_size; set_inx++) {
		bitstr_t *set_bitmap = node_set_ptr[set_inx].my_bitmap;

		if (union_bitmap == NULL) {
			/* first set: start the union from a private copy */
			union_bitmap = bit_copy(set_bitmap);
			if (union_bitmap == NULL)
				fatal("bit_copy malloc failure");
			continue;
		}
		bit_or(union_bitmap, set_bitmap);
	}

	/* Succeed only when there is a union and bit_super_set() reports 1
	 * for the required nodes against it */
	if (union_bitmap &&
	    (bit_super_set(req_bitmap, union_bitmap) == 1))
		rc = SLURM_SUCCESS;

	FREE_NULL_BITMAP(union_bitmap);
	return rc;
}
/* NOTE(review): the next three lines read like the text of this function's
 * header comment, but no comment delimiters surround them in this view, and
 * neither the function's opening brace nor its closing brace is visible.
 * Several statements further down also look truncated.  The comments added
 * below describe only what is visible; confirm against the complete file. */
* build_node_details - set cpu counts and addresses for allocated nodes:
* cpu_count_reps, cpus_per_node, node_addr, node_cnt, num_cpu_groups
* IN job_ptr - pointer to a job record
extern void build_node_details(struct job_record *job_ptr)
hostlist_t host_list = NULL;
struct node_record *node_ptr;
char *this_node_name;
int error_code = SLURM_SUCCESS, cr_enabled = 0;
/* node_inx counts node addresses stored so far; cpu_inx is the index of the
 * current cpus_per_node/cpu_count_reps entry (-1 until the first is used) */
int node_inx = 0, cpu_inx = -1;
/* cr_count is the next free slot in job_ptr->alloc_lps */
int cr_count = 0;
/* Job has no node bitmap or no node name list: zero the counters, set the
 * three per-node array pointers to NULL and release alloc_lps.
 * NOTE(review): no return statement is visible after this branch, so as
 * shown execution continues into hostlist_create() below -- TODO confirm. */
if ((job_ptr->node_bitmap == NULL) || (job_ptr->nodes == NULL)) {
/* No nodes allocated, we're done... */
job_ptr->num_cpu_groups = 0;
job_ptr->node_cnt = 0;
job_ptr->cpus_per_node = NULL;
job_ptr->cpu_count_reps = NULL;
job_ptr->node_addr = NULL;
job_ptr->alloc_lps_cnt = 0;
xfree(job_ptr->alloc_lps);
}
job_ptr->num_cpu_groups = 0;
/* Use hostlist here to insure ordering of info matches that of srun */
if ((host_list = hostlist_create(job_ptr->nodes)) == NULL)
fatal("hostlist_create error for %s: %m", job_ptr->nodes);
job_ptr->node_cnt = hostlist_count(host_list);
/* Size the three per-node arrays for node_cnt entries each */
xrealloc(job_ptr->cpus_per_node,
(sizeof(uint32_t) * job_ptr->node_cnt));
xrealloc(job_ptr->cpu_count_reps,
(sizeof(uint32_t) * job_ptr->node_cnt));
xrealloc(job_ptr->node_addr,
(sizeof(slurm_addr) * job_ptr->node_cnt));
/* Drop any previous alloc_lps array; a new one with one int per node is
 * allocated only when job_ptr->cr_enabled is set.
 * NOTE(review): the closing brace of the cr_enabled block is not visible. */
job_ptr->alloc_lps_cnt = 0;
xfree(job_ptr->alloc_lps);
if (job_ptr->cr_enabled) {
cr_enabled = job_ptr->cr_enabled;
job_ptr->alloc_lps = xmalloc(job_ptr->node_cnt * sizeof(int));
job_ptr->alloc_lps_cnt = job_ptr->node_cnt;
/* Take host names from the hostlist one at a time; each name is released
 * with free() further down */
while ((this_node_name = hostlist_shift(host_list))) {
node_ptr = find_node_record(this_node_name);
if (node_ptr) {
#ifdef HAVE_BG
/* HAVE_BG build, single-node job: store the node address and fill one
 * cpus_per_node entry with job_ptr->num_procs and a repetition count of 1.
 * NOTE(review): the "cleanup:" label further down suggests a jump out of
 * this branch, and a matching #endif, belong here; neither is visible. */
if(job_ptr->node_cnt == 1) {
memcpy(&job_ptr->node_addr[node_inx++],
&node_ptr->slurm_addr,
sizeof(slurm_addr));
cpu_inx++;
job_ptr->cpus_per_node[cpu_inx] =
job_ptr->num_procs;
job_ptr->cpu_count_reps[cpu_inx] = 1;
/* Query select_g_get_extra_jobinfo() for SELECT_AVAIL_CPUS on this node;
 * the result is written through &usable_lps.
 * NOTE(review): no declaration of usable_lps is visible in this view. */
error_code = select_g_get_extra_jobinfo(
node_ptr, job_ptr, SELECT_AVAIL_CPUS,
&usable_lps);
if (error_code == SLURM_SUCCESS) {
if (cr_enabled && job_ptr->alloc_lps) {
job_ptr->alloc_lps[cr_count++] =
usable_lps;
}
} else {
/* NOTE(review): alloc_lps is set to NULL here with no visible xfree();
 * confirm the array allocated above is not leaked on this path. */
if (cr_enabled) {
job_ptr->alloc_lps = NULL;
job_ptr->alloc_lps_cnt = 0;
}
error("Unable to get extra jobinfo "
"from JobId=%u", job_ptr->job_id);
memcpy(&job_ptr->node_addr[node_inx++],
&node_ptr->slurm_addr, sizeof(slurm_addr));
/* Start a new cpus_per_node entry (repetition count 1) when there is none
 * yet or the current entry's value differs; otherwise increment the
 * current entry's repetition count.
 * NOTE(review): the right-hand operand of the comparison and the value
 * assigned to cpus_per_node[cpu_inx] are not visible -- TODO confirm. */
if ((cpu_inx == -1) ||
(job_ptr->cpus_per_node[cpu_inx] !=
cpu_inx++;
job_ptr->cpus_per_node[cpu_inx] =
job_ptr->cpu_count_reps[cpu_inx] = 1;
} else
job_ptr->cpu_count_reps[cpu_inx]++;
} else {
error("Invalid node %s in JobId=%u",
this_node_name, job_ptr->job_id);
#ifdef HAVE_BG
cleanup:
#endif
free(this_node_name);
}
hostlist_destroy(host_list);
/* node_inx is the number of addresses stored above; log when it differs
 * from the hostlist count */
if (job_ptr->node_cnt != node_inx) {
error("Node count mismatch for JobId=%u (%u,%u)",
job_ptr->job_id, job_ptr->node_cnt, node_inx);
/* cpu_inx is the index of the last entry used, so the count is one more */
job_ptr->num_cpu_groups = cpu_inx + 1;
if ((cr_enabled) && (error_code == SLURM_SUCCESS)) {
/* Update cr node structure with this job's allocated resources */
error_code = select_g_update_nodeinfo(job_ptr);
/* NOTE(review): the fatal() call below is cut off after its format string;
 * its remaining argument and the closing braces are not visible. */
if(error_code != SLURM_SUCCESS)
fatal("Unable to update nodeinfo JobId=%u",
/*
 * _valid_features - determine if the requested features are satisfied by
 *	those available
 * IN requested - requested features (by a job)
 * IN available - available features (on a node)
 * RET 0 if request is not satisfied, otherwise an integer indicating which
 *	mutually exclusive feature is satisfied. for example
 *	_valid_features("[fs1|fs2|fs3|fs4]", "fs3") returns 3. see the
 *	slurm administrator and user guides for details. returns 1 if
 *	requirements are satisfied without mutually exclusive feature list.
 *
 * The request is scanned left to right on a private copy; each operator
 * character is overwritten with a terminator so the preceding feature name
 * can be handed to _match_feature() as a string.
 */
static int _valid_features(char *requested, char *available)
{
	char *tmp_requested, *str_ptr1;
	int bracket, found, i, option, position, result;
	int last_op;		/* last operation 0 for or, 1 for and */
	int save_op = 0, save_result = 0;	/* for bracket support */

	if (requested == NULL)
		return 1;	/* no constraints */
	if (available == NULL)
		return 0;	/* no features */

	tmp_requested = xstrdup(requested);
	bracket = option = position = 0;
	str_ptr1 = tmp_requested;	/* start of feature name */
	result = last_op = 1;	/* assume good for now */
	for (i = 0;; i++) {
		if (tmp_requested[i] == '\0') {
			/* end of request: evaluate the trailing name, if any */
			if (strlen(str_ptr1) == 0)
				break;
			found = _match_feature(str_ptr1, available);
			if (last_op == 1)	/* and */
				result &= found;
			else	/* or */
				result |= found;
			break;
		}

		if (tmp_requested[i] == '&') {
			/* "and" is not permitted inside a bracketed list */
			if (bracket != 0) {
				debug("_valid_features: parsing failure on %s",
					requested);
				result = 0;
				break;
			}
			tmp_requested[i] = '\0';
			found = _match_feature(str_ptr1, available);
			if (last_op == 1)	/* and */
				result &= found;
			else	/* or */
				result |= found;
			str_ptr1 = &tmp_requested[i + 1];
			last_op = 1;	/* and */

		} else if (tmp_requested[i] == '|') {
			tmp_requested[i] = '\0';
			found = _match_feature(str_ptr1, available);
			if (bracket != 0) {
				/* remember which alternative matched */
				if (found)
					option = position;
				position++;
			}
			if (last_op == 1)	/* and */
				result &= found;
			else	/* or */
				result |= found;
			str_ptr1 = &tmp_requested[i + 1];
			last_op = 0;	/* or */

		} else if (tmp_requested[i] == '[') {
			/* set the running result aside while the bracketed
			 * list is evaluated on its own */
			bracket++;
			position = 1;
			save_op = last_op;
			save_result = result;
			last_op = result = 1;
			str_ptr1 = &tmp_requested[i + 1];

		} else if (tmp_requested[i] == ']') {
			tmp_requested[i] = '\0';
			found = _match_feature(str_ptr1, available);
			if (found)
				option = position;
			result |= found;
			/* fold the result saved at '[' back in */
			if (save_op == 1)	/* and */
				result &= save_result;
			else	/* or */
				result |= save_result;
			if ((tmp_requested[i + 1] == '&')
			    && (bracket == 1)) {
				last_op = 1;
				str_ptr1 = &tmp_requested[i + 2];
			} else if ((tmp_requested[i + 1] == '|')
				   && (bracket == 1)) {
				last_op = 0;
				str_ptr1 = &tmp_requested[i + 2];
			} else if ((tmp_requested[i + 1] == '\0')
				   && (bracket == 1)) {
				break;
			} else {
				debug("_valid_features: parsing failure on %s",
					requested);
				result = 0;
				break;
			}
			bracket = 0;
		}
	}

	/* a bracketed list was seen: report which alternative matched */
	if (position)
		result *= option;
	xfree(tmp_requested);
	return result;
}
/*
* re_kill_job - for a given job, deallocate its nodes for a second time,
* basically a cleanup for failed deallocate() calls
* IN job_ptr - pointer to terminating job (already in some COMPLETING state)
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
*/
extern void re_kill_job(struct job_record *job_ptr)
{
int i;
kill_job_msg_t *kill_job;
agent_arg_t *agent_args;
hostlist_t kill_hostlist = hostlist_create("");
char host_str[64];

Moe Jette
committed
static uint32_t last_job_id = 0;
xassert(job_ptr);
xassert(job_ptr->details);
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_TERMINATE_JOB;
agent_args->retry = 0;
kill_job = xmalloc(sizeof(kill_job_msg_t));
kill_job->job_id = job_ptr->job_id;
kill_job->job_uid = job_ptr->user_id;
kill_job->time = time(NULL);
kill_job->select_jobinfo = select_g_copy_jobinfo(
job_ptr->select_jobinfo);
for (i = 0; i < node_record_count; i++) {
struct node_record *node_ptr = &node_record_table_ptr[i];
if ((job_ptr->node_bitmap == NULL) ||
(bit_test(job_ptr->node_bitmap, i) == 0))
continue;
if ((node_ptr->node_state & NODE_STATE_BASE)
== NODE_STATE_DOWN) {
/* Consider job already completed */
bit_clear(job_ptr->node_bitmap, i);
if (node_ptr->comp_job_cnt)
(node_ptr->comp_job_cnt)--;
if ((--job_ptr->node_cnt) == 0) {
last_node_update = time(NULL);
job_ptr->job_state &= (~JOB_COMPLETING);
slurm_sched_schedule();
}
continue;
}
if (node_ptr->node_state & NODE_STATE_NO_RESPOND)
continue;
(void) hostlist_push_host(kill_hostlist, node_ptr->name);
#ifdef HAVE_FRONT_END /* Operate only on front-end */
if (agent_args->node_count > 0)
continue;
#endif
hostlist_push(agent_args->hostlist, node_ptr->name);
agent_args->node_count++;
}
if (agent_args->node_count == 0) {
xfree(kill_job);
xfree(agent_args);
hostlist_destroy(kill_hostlist);
return;
}
hostlist_uniq(kill_hostlist);
hostlist_ranged_string(kill_hostlist,
sizeof(host_str), host_str);

Moe Jette
committed
if (job_ptr->job_id != last_job_id) {
info("Resending TERMINATE_JOB request JobId=%u BPlist=%s",
job_ptr->job_id, host_str);

Moe Jette
committed
} else {
debug("Resending TERMINATE_JOB request JobId=%u BPlist=%s",
job_ptr->job_id, host_str);
}

Moe Jette
committed
if (job_ptr->job_id != last_job_id) {
info("Resending TERMINATE_JOB request JobId=%u Nodelist=%s",
job_ptr->job_id, host_str);

Moe Jette
committed
} else {
debug("Resending TERMINATE_JOB request JobId=%u Nodelist=%s",
job_ptr->job_id, host_str);
}

Moe Jette
committed
last_job_id = job_ptr->job_id;
hostlist_destroy(kill_hostlist);
agent_args->msg_args = kill_job;
agent_queue_request(agent_args);