From b16d385bde1dc84e327e2dd05e4224508c497dc8 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 11 Dec 2008 18:32:54 +0000 Subject: [PATCH] The support of node-fail for a job allocation now seems to work as desired. --- src/slurmctld/step_mgr.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 7174f09c198..a4a3f4c8d16 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -486,7 +486,9 @@ _pick_step_nodes (struct job_record *job_ptr, * unused memory */ if (step_spec->exclusive) { int avail_cpus, avail_tasks, total_cpus, total_tasks, node_inx; + int i_first, i_last; uint32_t avail_mem, total_mem; + uint32_t nodes_picked_cnt = 0; uint32_t tasks_picked_cnt = 0, total_task_cnt = 0; bitstr_t *selected_nodes = NULL; @@ -516,11 +518,15 @@ _pick_step_nodes (struct job_record *job_ptr, } } - node_inx = 0; - for (i=bit_ffs(select_ptr->node_bitmap); i<node_record_count; - i++) { + node_inx = -1; + i_first = bit_ffs(select_ptr->node_bitmap); + i_last = bit_fls(select_ptr->node_bitmap); + for (i=i_first; i<=i_last; i++) { if (!bit_test(select_ptr->node_bitmap, i)) continue; + node_inx++; + if (!bit_test(nodes_avail, i)) + continue; /* node now DOWN */ avail_cpus = select_ptr->cpus[node_inx] - select_ptr->cpus_used[node_inx]; total_cpus = select_ptr->cpus[node_inx]; @@ -544,14 +550,15 @@ _pick_step_nodes (struct job_record *job_ptr, } if ((avail_tasks <= 0) || ((selected_nodes == NULL) && + (nodes_picked_cnt >= step_spec->node_count) && (tasks_picked_cnt > 0) && (tasks_picked_cnt >= step_spec->num_tasks))) bit_clear(nodes_avail, i); - else + else { + nodes_picked_cnt++; tasks_picked_cnt += avail_tasks; + } total_task_cnt += total_tasks; - if (++node_inx >= select_ptr->nhosts) - break; } if (selected_nodes) { @@ -1327,7 +1334,7 @@ extern slurm_step_layout_t *step_layout_create(struct step_record *step_ptr, for (i = first_bit; i <= last_bit; i++) { if (bit_test(step_ptr->step_node_bitmap, i)) { /* find out the position in the job */ - pos = bit_get_pos_num(job_ptr->node_bitmap, i); + pos = bit_get_pos_num(select_ptr->node_bitmap, i); if (pos == -1) return NULL; if (pos >= select_ptr->nhosts) @@ -1335,14 +1342,14 @@ extern slurm_step_layout_t *step_layout_create(struct step_record *step_ptr, if (step_ptr->exclusive) { usable_cpus = select_ptr->cpus[pos] - select_ptr->cpus_used[pos]; - usable_cpus = MAX(usable_cpus, - (num_tasks - set_tasks)); } else usable_cpus = select_ptr->cpus[pos]; if (step_ptr->mem_per_task) { usable_mem = select_ptr->memory_allocated[pos] - select_ptr->memory_used[pos]; usable_mem /= step_ptr->mem_per_task; + if (cpus_per_task > 0) + usable_mem *= cpus_per_task; usable_cpus = MIN(usable_cpus, usable_mem); } if (usable_cpus <= 0) { -- GitLab