From d9e07f7acfd3d184f22d6c2bef58e7fb9f8a3462 Mon Sep 17 00:00:00 2001 From: Alejandro Lucero Palau <alejandro.lucero@bsc.es> Date: Thu, 25 Oct 2012 15:20:51 -0700 Subject: [PATCH] hanges for core based reservation topology support --- src/plugins/select/cons_res/select_cons_res.c | 157 ++++++++++++++---- src/slurmctld/reservation.c | 36 ++-- testsuite/expect/inc3.11.1 | 1 - 3 files changed, 150 insertions(+), 44 deletions(-) diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 736ffef606d..8b7e0291078 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -2514,8 +2514,9 @@ bitstr_t *_make_core_bitmap_filtered(bitstr_t *node_map, int filter) return core_map; nodes = bit_size(node_map); - for (n = 0, c = 0; n < nodes; n++) { + for (n = 0; n < nodes; n++) { if (bit_test(node_map, n)) { + c = cr_get_coremap_offset(n); coff = cr_get_coremap_offset(n+1); while (c < coff) { bit_set(core_map, c++); @@ -2663,6 +2664,22 @@ bitstr_t *sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, return sp_avail_bitmap; } +static int _get_avail_core_in_node(bitstr_t *core_bitmap, int node) +{ + int coff; + int total_cores; + int i; + int avail = 0; + + coff = cr_get_coremap_offset(node); + total_cores = cr_node_num_cores[node]; + + for (i = 0; i < total_cores; i++) + if (!bit_test(core_bitmap, coff + i)) + avail++; + return avail; +} + /* * select_p_resv_test - Identify the nodes which "best" satisfy a reservation * request. "best" is defined as either single set of consecutive nodes @@ -2677,6 +2694,7 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, uint32_t core_cnt, bitstr_t **core_bitmap) { bitstr_t **switches_bitmap; /* nodes on this switch */ + bitstr_t **switches_core_bitmap; /* cores on this switch */ int *switches_cpu_cnt; /* total CPUs on switch */ int *switches_node_cnt; /* total nodes on switch */ int *switches_required; /* set if has required node */ @@ -2689,6 +2707,7 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, int best_fit_nodes; int best_fit_location = 0, best_fit_sufficient; bool sufficient; + int cores_per_node; xassert(avail_bitmap); @@ -2701,38 +2720,63 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, if (bit_set_count(avail_bitmap) < node_cnt) return avail_nodes_bitmap; + if (*core_bitmap == NULL) + *core_bitmap = _make_core_bitmap_filtered(avail_bitmap, 0); + rem_nodes = node_cnt; rem_cores = core_cnt; + /* TODO: allowing asymmetric cluster */ + cores_per_node = core_cnt / MAX(node_cnt, 1); + /* Construct a set of switch array entries, * use the same indexes as switch_record_table in slurmctld */ switches_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt); + switches_core_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt); switches_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt); switches_node_cnt = xmalloc(sizeof(int) * switch_record_cnt); switches_required = xmalloc(sizeof(int) * switch_record_cnt); for (i=0; i<switch_record_cnt; i++) { - bitstr_t *switch_bitmap_copy = xmalloc(sizeof(bitstr_t *)); - int node_cnt; + char str[100]; switches_bitmap[i] = bit_copy(switch_record_table[i]. node_bitmap); bit_and(switches_bitmap[i], avail_bitmap); switches_node_cnt[i] = bit_set_count(switches_bitmap[i]); - switch_bitmap_copy = bit_copy(switches_bitmap[i]); - node_cnt = switches_node_cnt[i]; - debug2("switch %d looking cores in %d nodes", - i, switches_node_cnt[i]); - while (node_cnt--) { - int node_inx; - node_inx = bit_ffs(switch_bitmap_copy); - switches_cpu_cnt[i] += cr_node_num_cores[node_inx]; - bit_nclear(switch_bitmap_copy, node_inx, node_inx); - } - - debug2("switch %d with %d nodes and %d cores", + + switches_core_bitmap[i] = + _make_core_bitmap_filtered(switches_bitmap[i], 1); + + if (*core_bitmap) { + bit_not(*core_bitmap); + bit_and(switches_core_bitmap[i], *core_bitmap); + bit_not(*core_bitmap); + } + bit_fmt(str, sizeof(str), switches_core_bitmap[i]); + debug2("Switch %d can use cores: %s", i, str); + + switches_cpu_cnt[i] = bit_set_count(switches_core_bitmap[i]); + + debug2("switch %d looking cores in %d nodes (%d)", i, switches_node_cnt[i], switches_cpu_cnt[i]); } + /* Let's check nodes with less avail cores than needed */ + + for (j=0; j<switch_record_cnt; j++) { + first = bit_ffs(switches_bitmap[j]); + last = bit_fls(switches_bitmap[j]); + for (i=first; ((i<=last) && (first>=0)); i++) { + int c; + if (!bit_test(switches_bitmap[j], i)) + continue; + + c = _get_avail_core_in_node(*core_bitmap, i); + if (c < cores_per_node) + switches_node_cnt[j] -= c; + } + } + #if SELECT_DEBUG /* Don't compile this, it slows things down too much */ for (i=0; i<switch_record_cnt; i++) { @@ -2781,6 +2825,7 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, /* Select resources from these leafs on a best-fit basis */ avail_nodes_bitmap = bit_alloc(node_record_count); while (rem_nodes > 0) { + int avail_cores_in_node; best_fit_nodes = best_fit_sufficient = 0; for (j=0; j<switch_record_cnt; j++) { if (switches_node_cnt[j] == 0) @@ -2810,7 +2855,7 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, for (i=first; ((i<=last) && (first>=0)); i++) { if (!bit_test(switches_bitmap[best_fit_location], i)){ continue; - } + } bit_clear(switches_bitmap[best_fit_location], i); switches_node_cnt[best_fit_location]--; @@ -2821,9 +2866,26 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, continue; } + if (*core_bitmap) { + int coff; + avail_cores_in_node = 0; + coff = cr_get_coremap_offset(i); + debug2("Testing node %d, core offset %d", + i, coff); + for (j=0; j<cr_node_num_cores[i]; j++){ + if (!bit_test(*core_bitmap, coff + j)) + avail_cores_in_node++; + } + if (avail_cores_in_node < cores_per_node) + continue; + + debug2("Using node %d with %d cores available", + i, avail_cores_in_node); + } + bit_set(avail_nodes_bitmap, i); if (core_cnt) - rem_cores -= cr_node_num_cores[i]; + rem_cores -= cores_per_node; if (--rem_nodes <= 0) break; } @@ -2832,31 +2894,44 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, if (rem_nodes > 0) /* insufficient resources */ FREE_NULL_BITMAP(avail_nodes_bitmap); -fini: for (i=0; i<switch_record_cnt; i++) +fini: for (i=0; i<switch_record_cnt; i++) { FREE_NULL_BITMAP(switches_bitmap[i]); + FREE_NULL_BITMAP(switches_core_bitmap[i]); + } xfree(switches_bitmap); + xfree(switches_core_bitmap); xfree(switches_cpu_cnt); xfree(switches_node_cnt); xfree(switches_required); - if (core_cnt) { /* Reservation is using partial nodes */ + if (avail_nodes_bitmap && core_cnt) { + /* Reservation is using partial nodes */ //char str[100]; - int cores_per_node; + bitstr_t *exc_core_bitmap = NULL; sp_avail_bitmap = bit_alloc(bit_size(avail_bitmap)); if (sp_avail_bitmap == NULL) fatal ("memory allocation failure"); - *core_bitmap = _make_core_bitmap_filtered(avail_bitmap, 0); + if (*core_bitmap) { + exc_core_bitmap = bit_alloc(bit_size(*core_bitmap)); + if (!exc_core_bitmap) + fatal("bit_alloc: malloc failure"); + } else { + error("select_p_resv_test: core_bitmap is NULL"); + FREE_NULL_BITMAP(sp_avail_bitmap); + return NULL; + } cores_per_node = core_cnt / MAX(node_cnt, 1); while (core_cnt) { uint32_t inx, coff; int i; + int avail_cores_in_node; - inx = bit_ffs(avail_bitmap); + inx = bit_ffs(avail_nodes_bitmap); if ((inx < 0) || (inx > bit_size(avail_bitmap))) break; @@ -2864,19 +2939,43 @@ fini: for (i=0; i<switch_record_cnt; i++) "core_cnt: %d", inx, cores_per_node, core_cnt); coff = cr_get_coremap_offset(inx); - for (i = 0; i < cores_per_node; i++){ - /* TODO: checking cores_per_nodes is lower - * than real cores per node */ - bit_set(*core_bitmap, coff++); - core_cnt--; + /* Clear this node from the initial available bitmap */ + bit_clear(avail_nodes_bitmap, inx); + + if (cr_node_num_cores[inx] < cores_per_node) + continue; + + avail_cores_in_node = 0; + for (i = 0; i < cr_node_num_cores[inx]; i++) { + if (!bit_test(exc_core_bitmap, coff + i)) { + avail_cores_in_node++; + } + } + + debug2("Node %d has %d available cores", inx, + avail_cores_in_node); + + if (avail_cores_in_node < cores_per_node) + continue; + + avail_cores_in_node = 0; + for (i = 0; i < cr_node_num_cores[inx]; i++) { + if (!bit_test(exc_core_bitmap, coff + i)) { + bit_set(*core_bitmap, coff + i); + core_cnt--; + avail_cores_in_node++; + } + + if ((avail_cores_in_node == cores_per_node) || + (core_cnt == 0)) + break; } /* Add this node to the final node bitmap */ bit_set(sp_avail_bitmap, inx); - /* Clear this node from the initial available bitmap */ - bit_clear(avail_bitmap, inx); } + FREE_NULL_BITMAP(exc_core_bitmap); //bit_fmt(str, (sizeof(str) - 1), *core_bitmap); //info("sequential pick using coremap: %s", str); diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 8a73084b696..ccfcf961c21 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -1783,6 +1783,10 @@ extern int update_resv(resv_desc_msg_t *resv_desc_ptr) if (!resv_ptr) return ESLURM_RESERVATION_INVALID; + /* TODO: core based reservation updates */ + if (resv_ptr->full_nodes == 0) + return ESLURM_RESERVATION_NOT_USABLE; + /* Make backup to restore state in case of failure */ resv_backup = _copy_resv(resv_ptr); @@ -2462,6 +2466,7 @@ static void _validate_all_reservations(void) static void _validate_node_choice(slurmctld_resv_t *resv_ptr) { bitstr_t *tmp_bitmap = NULL; + bitstr_t *core_bitmap = NULL; int i; resv_desc_msg_t resv_desc; @@ -2482,7 +2487,8 @@ static void _validate_node_choice(slurmctld_resv_t *resv_ptr) resv_desc.features = resv_ptr->features; resv_desc.node_cnt = xmalloc(sizeof(uint32_t) * 2); resv_desc.node_cnt[0]= resv_ptr->node_cnt - i; - i = _select_nodes(&resv_desc, &resv_ptr->part_ptr, &tmp_bitmap, NULL); + i = _select_nodes(&resv_desc, &resv_ptr->part_ptr, &tmp_bitmap, + &core_bitmap); xfree(resv_desc.node_cnt); xfree(resv_desc.node_list); xfree(resv_desc.partition); @@ -2490,6 +2496,8 @@ static void _validate_node_choice(slurmctld_resv_t *resv_ptr) bit_and(resv_ptr->node_bitmap, avail_node_bitmap); bit_or(resv_ptr->node_bitmap, tmp_bitmap); FREE_NULL_BITMAP(tmp_bitmap); + FREE_NULL_BITMAP(resv_ptr->core_bitmap); + resv_ptr->core_bitmap = core_bitmap; xfree(resv_ptr->node_list); resv_ptr->node_list = bitmap2node_name(resv_ptr->node_bitmap); info("modified reservation %s due to unusable nodes, " @@ -2691,6 +2699,7 @@ extern int validate_job_resv(struct job_record *job_ptr) static int _resize_resv(slurmctld_resv_t *resv_ptr, uint32_t node_cnt) { bitstr_t *tmp1_bitmap = NULL, *tmp2_bitmap = NULL; + bitstr_t *core_bitmap = NULL; int delta_node_cnt, i; resv_desc_msg_t resv_desc; @@ -2746,13 +2755,16 @@ static int _resize_resv(slurmctld_resv_t *resv_ptr, uint32_t node_cnt) resv_desc.flags = resv_ptr->flags; resv_desc.node_cnt = xmalloc(sizeof(uint32_t) * 2); resv_desc.node_cnt[0]= 0 - delta_node_cnt; - i = _select_nodes(&resv_desc, &resv_ptr->part_ptr, &tmp1_bitmap, NULL); + i = _select_nodes(&resv_desc, &resv_ptr->part_ptr, &tmp1_bitmap, + &core_bitmap); xfree(resv_desc.node_cnt); xfree(resv_desc.node_list); xfree(resv_desc.partition); if (i == SLURM_SUCCESS) { bit_or(resv_ptr->node_bitmap, tmp1_bitmap); FREE_NULL_BITMAP(tmp1_bitmap); + FREE_NULL_BITMAP(resv_ptr->core_bitmap); + resv_ptr->core_bitmap = core_bitmap; xfree(resv_ptr->node_list); resv_ptr->node_list = bitmap2node_name(resv_ptr->node_bitmap); resv_ptr->node_cnt = node_cnt; @@ -3515,7 +3527,8 @@ extern int job_test_resv(struct job_record *job_ptr, time_t *when, (res2_ptr == resv_ptr) || (res2_ptr->node_bitmap == NULL) || (res2_ptr->start_time >= job_end_time) || - (res2_ptr->end_time <= job_start_time)) + (res2_ptr->end_time <= job_start_time) || + (!res2_ptr->full_nodes)) continue; bit_not(res2_ptr->node_bitmap); bit_and(*node_bitmap, res2_ptr->node_bitmap); @@ -3564,10 +3577,10 @@ extern int job_test_resv(struct job_record *job_ptr, time_t *when, (resv_ptr->end_time <= job_start_time)) continue; if (job_ptr->details->req_node_bitmap && - bit_overlap(job_ptr->details->req_node_bitmap, - resv_ptr->node_bitmap) && - ((resv_ptr->cpu_cnt == 0) || - (!job_ptr->details->shared))) { + bit_overlap(job_ptr->details->req_node_bitmap, + resv_ptr->node_bitmap) && + ((resv_ptr->cpu_cnt == 0) || + (!job_ptr->details->shared))) { *when = resv_ptr->end_time; rc = ESLURM_NODES_BUSY; break; @@ -3596,13 +3609,8 @@ extern int job_test_resv(struct job_record *job_ptr, time_t *when, *exc_core_bitmap = bit_copy(resv_ptr->core_bitmap); } else { - char str[100]; - bit_and(*exc_core_bitmap, - resv_ptr->core_bitmap); - bit_fmt(str, (sizeof(str) - 1), - *exc_core_bitmap); - debug2("New exclude core bitmap %s", - str); + bit_or(*exc_core_bitmap, + resv_ptr->core_bitmap); } } } diff --git a/testsuite/expect/inc3.11.1 b/testsuite/expect/inc3.11.1 index 7d8e7d90093..c62798eb6bb 100644 --- a/testsuite/expect/inc3.11.1 +++ b/testsuite/expect/inc3.11.1 @@ -59,7 +59,6 @@ proc inc3_11_1 {} { {StartTime=now Duration=5 Nodes=$def_node User=$user_name Flags=badtype,ignore_jobs} {StartTime=now+10minutes EndTime=now Nodes=$def_node User=$user_name Flags=ignore_jobs} {StartTime=now Duration=5 Nodes=$def_node User=$user_name Licenses=DUMMY_FOR_TESTING Flags=ignore_jobs} - #{StartTime=now Duration=5 NodeCnt=2 CoreCnt=1 User=$user_name} {StartTime=now Duration=5 NodeCnt=1 CoreCnt=$core_res_num User=$user_name} " #{StartTime=now Duration=5 Nodes=$def_node Account=badaccountname} -- GitLab