Newer
Older
job_ptr->node_bitmap = select_bitmap;
if (select_g_job_begin(job_ptr) != SLURM_SUCCESS) {
/* Leave job queued, something is hosed */
error("select_g_job_begin(%u): %m", job_ptr->job_id);
error_code = ESLURM_NODES_BUSY;
goto cleanup;
}
/* assign the nodes and stage_in the job */
if (detail_ptr)
detail_ptr->wait_reason = WAIT_NO_REASON;
job_ptr->nodes = bitmap2node_name(select_bitmap);
select_bitmap = NULL; /* nothing left to free */
allocate_nodes(job_ptr);
build_node_details(job_ptr);
job_ptr->job_state = JOB_RUNNING;
job_ptr->start_time = job_ptr->time_last_active = time(NULL);
if (job_ptr->time_limit == NO_VAL)
job_ptr->time_limit = part_ptr->max_time;
if (job_ptr->time_limit == INFINITE)
job_ptr->end_time = job_ptr->start_time +
(365 * 24 * 60 * 60); /* secs in year */
job_ptr->end_time = job_ptr->start_time +
(job_ptr->time_limit * 60); /* secs */
if (job_ptr->mail_type & MAIL_JOB_BEGIN)
mail_job_info(job_ptr, MAIL_JOB_BEGIN);
cleanup:
if (select_node_bitmap)
*select_node_bitmap = select_bitmap;
else
FREE_NULL_BITMAP(select_bitmap);
if (node_set_ptr) {
for (i = 0; i < node_set_size; i++)
FREE_NULL_BITMAP(node_set_ptr[i].my_bitmap);
xfree(node_set_ptr);
}
return error_code;
}
/*
 * _build_node_list - identify which nodes could be allocated to a job
 * IN job_ptr - pointer to job to be scheduled
 * OUT node_set_pptr - list of node sets which could be used for the job,
 *	written only on success
 * OUT node_set_size - number of node_set entries, written only on success
 * RET SLURM_SUCCESS, or ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE if no
 *	configuration record contributes a usable node
 * NOTE: on success the caller owns the returned array and the my_bitmap of
 *	each entry; on failure nothing is returned and all memory allocated
 *	here has been released.
 */
static int _build_node_list(struct job_record *job_ptr,
			    struct node_set **node_set_pptr,
			    int *node_set_size)
{
	int node_set_inx = 0;
	struct node_set *node_set_ptr;
	struct config_record *config_ptr;
	struct part_record *part_ptr = job_ptr->part_ptr;
	ListIterator config_iterator;
	int tmp_feature, check_node_config, config_filter;
	struct job_details *detail_ptr = job_ptr->details;
	bitstr_t *exc_node_mask = NULL;

	/* The array always holds one record beyond the one being built.
	 * Clear both bitmap pointers explicitly so the cleanup below never
	 * depends on the allocator zeroing memory. */
	node_set_ptr = xmalloc(sizeof(struct node_set) * 2);
	node_set_ptr[node_set_inx].my_bitmap = NULL;
	node_set_ptr[node_set_inx + 1].my_bitmap = NULL;

	/* Invert the job's excluded-node bitmap so that it can be applied
	 * to each candidate set with a plain bit_and() */
	if (detail_ptr->exc_node_bitmap) {
		exc_node_mask = bit_copy(detail_ptr->exc_node_bitmap);
		if (exc_node_mask == NULL)
			fatal("bit_copy malloc failure");
		bit_not(exc_node_mask);
	}

	config_iterator = list_iterator_create(config_list);
	if (config_iterator == NULL)
		fatal("list_iterator_create malloc failure");

	while ((config_ptr = (struct config_record *)
			list_next(config_iterator))) {
		struct node_set *cur;

		tmp_feature = _valid_features(detail_ptr->features,
					      config_ptr->feature);
		if (tmp_feature == 0)
			continue;

		/* non-zero if the configuration record, as defined, is too
		 * small for the job */
		config_filter =
			(detail_ptr->min_procs    > config_ptr->cpus) ||
			(detail_ptr->min_memory   > config_ptr->real_memory) ||
			(detail_ptr->min_tmp_disk > config_ptr->tmp_disk);

		/* since nodes can register with more resources than defined */
		/* in the configuration, we want to use those higher values */
		/* for scheduling, but only as needed (slower) */
		if (slurmctld_conf.fast_schedule) {
			if (config_filter)
				continue;
			check_node_config = 0;
		} else {
			check_node_config = config_filter;
		}

		/* cur is recomputed every pass: xrealloc() below may move
		 * the array */
		cur = &node_set_ptr[node_set_inx];
		cur->my_bitmap = bit_copy(config_ptr->node_bitmap);
		if (cur->my_bitmap == NULL)
			fatal("bit_copy malloc failure");
		bit_and(cur->my_bitmap, part_ptr->node_bitmap);
		if (exc_node_mask)
			bit_and(cur->my_bitmap, exc_node_mask);
		cur->nodes = bit_set_count(cur->my_bitmap);

		if (check_node_config && (cur->nodes != 0))
			_filter_nodes_in_set(cur, detail_ptr);

		if (cur->nodes == 0) {
			/* nothing usable here, reuse this slot */
			FREE_NULL_BITMAP(cur->my_bitmap);
			continue;
		}

		cur->cpus_per_node = config_ptr->cpus;
		cur->weight        = config_ptr->weight;
		cur->feature       = tmp_feature;
		debug2("found %d usable nodes from config containing %s",
			cur->nodes, config_ptr->nodes);

		node_set_inx++;
		xrealloc(node_set_ptr,
			 sizeof(struct node_set) * (node_set_inx + 2));
		node_set_ptr[node_set_inx + 1].my_bitmap = NULL;
	}
	list_iterator_destroy(config_iterator);

	/* eliminate last (incomplete) node_set record */
	FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap);
	FREE_NULL_BITMAP(exc_node_mask);

	if (node_set_inx == 0) {
		info("No nodes satisfy job %u requirements",
		     job_ptr->job_id);
		/* the array is not handed back on this path, so release it
		 * here rather than leaking it */
		xfree(node_set_ptr);
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	}

	*node_set_size = node_set_inx;
	*node_set_pptr = node_set_ptr;
	return SLURM_SUCCESS;
}
/* Remove from the node set any nodes which lack sufficient resources
* to satisfy the job's request */
static void _filter_nodes_in_set(struct node_set *node_set_ptr,
struct job_details *job_con)
{
int i;
if (slurmctld_conf.fast_schedule) { /* test config records */
struct config_record *node_con = NULL;
for (i = 0; i < node_record_count; i++) {
if (bit_test(node_set_ptr->my_bitmap, i) == 0)
continue;
node_con = node_record_table_ptr[i].config_ptr;
if ((job_con->min_procs <= node_con->cpus) &&
(job_con->min_memory <= node_con->real_memory) &&
(job_con->min_tmp_disk <= node_con->tmp_disk))
continue;
bit_clear(node_set_ptr->my_bitmap, i);
if ((--(node_set_ptr->nodes)) == 0)
break;
}
} else { /* fast_schedule == 0, test individual node records */
struct node_record *node_ptr = NULL;
for (i = 0; i < node_record_count; i++) {
if (bit_test(node_set_ptr->my_bitmap, i) == 0)
continue;
node_ptr = &node_record_table_ptr[i];
if ((job_con->min_procs <= node_ptr->cpus) &&
(job_con->min_memory <= node_ptr->real_memory) &&
(job_con->min_tmp_disk <= node_ptr->tmp_disk))
continue;
bit_clear(node_set_ptr->my_bitmap, i);
if ((--(node_set_ptr->nodes)) == 0)
break;
}
}
}
/*
 * _nodes_in_sets - Determine if required nodes are included in node_set(s)
 * IN req_bitmap - nodes specifically required by the job
 * IN node_set_ptr - sets of valid nodes
 * IN node_set_size - count of node_set entries
 * RET SLURM_SUCCESS if in set, otherwise
 *	ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE (also returned when
 *	node_set_size yields no node sets at all)
 */
static int _nodes_in_sets(bitstr_t *req_bitmap,
			  struct node_set * node_set_ptr,
			  int node_set_size)
{
	bitstr_t *union_bitmap = NULL;	/* OR of every set's bitmap */
	int rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	int inx;

	for (inx = 0; inx < node_set_size; inx++) {
		if (union_bitmap == NULL) {
			/* the first set seeds the accumulator */
			union_bitmap = bit_copy(node_set_ptr[inx].my_bitmap);
			if (union_bitmap == NULL)
				fatal("bit_copy malloc failure");
			continue;
		}
		bit_or(union_bitmap, node_set_ptr[inx].my_bitmap);
	}

	/* success only if there was at least one set and bit_super_set()
	 * reports 1 for the required nodes against the accumulated bitmap */
	if ((union_bitmap != NULL) &&
	    (bit_super_set(req_bitmap, union_bitmap) == 1))
		rc = SLURM_SUCCESS;

	FREE_NULL_BITMAP(union_bitmap);
	return rc;
}
/*
 * build_node_details - set cpu counts and addresses for allocated nodes:
 *	cpu_count_reps, cpus_per_node, node_addr, node_cnt, num_cpu_groups
 * IN job_ptr - pointer to a job record
 */
extern void build_node_details(struct job_record *job_ptr)
hostlist_t host_list = NULL;
struct node_record *node_ptr;
char *this_node_name;
int error_code = SLURM_SUCCESS, cr_enabled = 0;
int node_inx = 0, cpu_inx = -1;
int cr_count = 0;
if ((job_ptr->node_bitmap == NULL) || (job_ptr->nodes == NULL)) {
/* No nodes allocated, we're done... */
job_ptr->num_cpu_groups = 0;
job_ptr->node_cnt = 0;
job_ptr->cpus_per_node = NULL;
job_ptr->cpu_count_reps = NULL;
job_ptr->node_addr = NULL;
job_ptr->ntask_cnt = 0;
}
job_ptr->num_cpu_groups = 0;
/* Use hostlist here to insure ordering of info matches that of srun */
if ((host_list = hostlist_create(job_ptr->nodes)) == NULL)
fatal("hostlist_create error for %s: %m", job_ptr->nodes);
job_ptr->node_cnt = hostlist_count(host_list);
xrealloc(job_ptr->cpus_per_node,
(sizeof(uint32_t) * job_ptr->node_cnt));
xrealloc(job_ptr->cpu_count_reps,
(sizeof(uint32_t) * job_ptr->node_cnt));
xrealloc(job_ptr->node_addr,
(sizeof(slurm_addr) * job_ptr->node_cnt));
job_ptr->ntask_cnt = 0;
if (job_ptr->cr_enabled) {
cr_enabled = job_ptr->cr_enabled;
job_ptr->ntask = xmalloc(job_ptr->node_cnt * sizeof(int));
job_ptr->ntask_cnt = job_ptr->node_cnt;
}
while ((this_node_name = hostlist_shift(host_list))) {
node_ptr = find_node_record(this_node_name);
if (node_ptr) {
int usable_cpus = 0;
#ifdef HAVE_BG
if(job_ptr->node_cnt == 1) {
memcpy(&job_ptr->node_addr[node_inx++],
&node_ptr->slurm_addr,
sizeof(slurm_addr));
cpu_inx++;
job_ptr->cpus_per_node[cpu_inx] =
job_ptr->num_procs;
job_ptr->cpu_count_reps[cpu_inx] = 1;
if (cr_enabled) {
error_code = select_g_get_extra_jobinfo(
node_ptr, job_ptr,
SELECT_CR_USABLE_CPUS, &usable_cpus);
job_ptr->ntask[cr_count++] = usable_cpus;
if(error_code != SLURM_SUCCESS) {
xfree(job_ptr->ntask);
error("Unable to get extra jobinfo "
"from JobId=%u",
job_ptr->job_id);
}
} else if (slurmctld_conf.fast_schedule) {
usable_cpus = node_ptr->config_ptr->cpus;
} else {
usable_cpus = node_ptr->cpus;
}
memcpy(&job_ptr->node_addr[node_inx++],
&node_ptr->slurm_addr, sizeof(slurm_addr));
if ((cpu_inx == -1) ||
(job_ptr->cpus_per_node[cpu_inx] !=
usable_cpus)) {
cpu_inx++;
job_ptr->cpus_per_node[cpu_inx] =
job_ptr->cpu_count_reps[cpu_inx] = 1;
} else
job_ptr->cpu_count_reps[cpu_inx]++;
} else {
error("Invalid node %s in JobId=%u",
this_node_name, job_ptr->job_id);
}
hostlist_destroy(host_list);
if (job_ptr->node_cnt != node_inx) {
error("Node count mismatch for JobId=%u (%u,%u)",
job_ptr->job_id, job_ptr->node_cnt, node_inx);
job_ptr->num_cpu_groups = cpu_inx + 1;
if ((cr_enabled) && (error_code == SLURM_SUCCESS)) {
error_code = select_g_update_nodeinfo(job_ptr, SELECT_CR_USED_CPUS);
if(error_code != SLURM_SUCCESS)
fatal("Unable to update nodeinfo JobId=%u",
/*
 * _valid_features - determine if the requested features are satisfied by
 *	those available
 * IN requested - requested features (by a job)
 * IN available - available features (on a node)
 * RET 0 if request is not satisfied, otherwise an integer indicating which
 *	mutually exclusive feature is satisfied. For example,
 *	_valid_features("[fs1|fs2|fs3|fs4]", "fs3") returns 3. See the
 *	SLURM administrator and user guides for details. Returns 1 if
 *	requirements are satisfied without mutually exclusive feature list.
 */
static int _valid_features(char *requested, char *available)
char *tmp_requested, *str_ptr1;
int bracket, found, i, option, position, result;
int last_op; /* last operation 0 for or, 1 for and */
int save_op = 0, save_result = 0; /* for bracket support */
if (requested == NULL)
return 1; /* no constraints */
if (available == NULL)
return 0; /* no features */
tmp_requested = xstrdup(requested);
bracket = option = position = 0;
str_ptr1 = tmp_requested; /* start of feature name */
result = last_op = 1; /* assume good for now */
for (i = 0;; i++) {
if (tmp_requested[i] == (char) NULL) {
if (strlen(str_ptr1) == 0)
break;
found = _match_feature(str_ptr1, available);
if (last_op == 1) /* and */
result &= found;
else /* or */
result |= found;
break;
if (tmp_requested[i] == '&') {
if (bracket != 0) {
debug("_valid_features: parsing failure on %s",
requested);
result = 0;
break;
tmp_requested[i] = (char) NULL;
found = _match_feature(str_ptr1, available);
if (last_op == 1) /* and */
result &= found;
else /* or */
result |= found;
str_ptr1 = &tmp_requested[i + 1];
last_op = 1; /* and */
} else if (tmp_requested[i] == '|') {
tmp_requested[i] = (char) NULL;
found = _match_feature(str_ptr1, available);
if (bracket != 0) {
if (found)
option = position;
position++;
}
if (last_op == 1) /* and */
result &= found;
else /* or */
result |= found;
str_ptr1 = &tmp_requested[i + 1];
last_op = 0; /* or */
} else if (tmp_requested[i] == '[') {
bracket++;
position = 1;
save_op = last_op;
save_result = result;
last_op = result = 1;
str_ptr1 = &tmp_requested[i + 1];
} else if (tmp_requested[i] == ']') {
tmp_requested[i] = (char) NULL;
found = _match_feature(str_ptr1, available);
if (found)
option = position;
result |= found;
if (save_op == 1) /* and */
result &= save_result;
else /* or */
result |= save_result;
if ((tmp_requested[i + 1] == '&')
&& (bracket == 1)) {
last_op = 1;
str_ptr1 = &tmp_requested[i + 2];
} else if ((tmp_requested[i + 1] == '|')
&& (bracket == 1)) {
last_op = 0;
str_ptr1 = &tmp_requested[i + 2];
} else if ((tmp_requested[i + 1] == (char) NULL)
&& (bracket == 1)) {
break;
debug("_valid_features: parsing failure on %s",
requested);
result = 0;
break;
bracket = 0;
if (position)
result *= option;
xfree(tmp_requested);
return result;
}
/*
* re_kill_job - for a given job, deallocate its nodes for a second time,
* basically a cleanup for failed deallocate() calls
* IN job_ptr - pointer to terminating job (already in some COMPLETING state)
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
*/
extern void re_kill_job(struct job_record *job_ptr)
{
int i;
kill_job_msg_t *kill_job;
agent_arg_t *agent_args;
hostlist_t kill_hostlist = hostlist_create("");
char host_str[64];
xassert(job_ptr);
xassert(job_ptr->details);
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_TERMINATE_JOB;
agent_args->retry = 0;
kill_job = xmalloc(sizeof(kill_job_msg_t));
kill_job->job_id = job_ptr->job_id;
kill_job->job_uid = job_ptr->user_id;
kill_job->time = time(NULL);
kill_job->select_jobinfo = select_g_copy_jobinfo(
job_ptr->select_jobinfo);
for (i = 0; i < node_record_count; i++) {
struct node_record *node_ptr = &node_record_table_ptr[i];
if ((job_ptr->node_bitmap == NULL) ||
(bit_test(job_ptr->node_bitmap, i) == 0))
continue;
if ((node_ptr->node_state & NODE_STATE_BASE)
== NODE_STATE_DOWN) {
/* Consider job already completed */
bit_clear(job_ptr->node_bitmap, i);
if (node_ptr->comp_job_cnt)
(node_ptr->comp_job_cnt)--;
if ((--job_ptr->node_cnt) == 0) {
last_node_update = time(NULL);
job_ptr->job_state &= (~JOB_COMPLETING);
}
continue;
}
if (node_ptr->node_state & NODE_STATE_NO_RESPOND)
continue;
(void) hostlist_push_host(kill_hostlist, node_ptr->name);
#ifdef HAVE_FRONT_END /* Operate only on front-end */
if (agent_args->node_count > 0)
continue;
#endif
hostlist_push(agent_args->hostlist, node_ptr->name);
agent_args->node_count++;
}
if (agent_args->node_count == 0) {
xfree(kill_job);
xfree(agent_args);
hostlist_destroy(kill_hostlist);
return;
}
hostlist_uniq(kill_hostlist);
hostlist_ranged_string(kill_hostlist,
sizeof(host_str), host_str);
#ifdef HAVE_BG
info("Resending TERMINATE_JOB request JobId=%u BPlist=%s",
job_ptr->job_id, host_str);
#else
info("Resending TERMINATE_JOB request JobId=%u Nodelist=%s",
job_ptr->job_id, host_str);
hostlist_destroy(kill_hostlist);
agent_args->msg_args = kill_job;
agent_queue_request(agent_args);