From 903b5654f27f76565bba1b2ef4ff33de4033be90 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Tue, 15 Jan 2013 15:19:27 -0800 Subject: [PATCH] Correct gres logic to handle difference in core/cpu count The function gres_plugin_job_test() was returning a count of cores available to a job, but the select plugins were treating this as a CPU count. This change converts the core count into a CPU count as needed in the select plugins and changes the comments related to the function gres_plugin_job_test(). --- src/common/gres.c | 4 +-- src/common/gres.h | 4 +-- src/plugins/select/cons_res/job_test.c | 36 ++++++++++++++-------- src/plugins/select/linear/select_linear.c | 16 +++++++--- src/plugins/select/serial/job_test.c | 37 +++++++++++++++-------- 5 files changed, 63 insertions(+), 34 deletions(-) diff --git a/src/common/gres.c b/src/common/gres.c index a1155575cce..fa321506b4f 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -2681,8 +2681,8 @@ extern uint32_t _job_test(void *job_gres_data, void *node_gres_data, * IN cpu_end_bit - index into cpu_bitmap for this node's last CPU * IN job_id - job's ID (for logging) * IN node_name - name of the node (for logging) - * RET: NO_VAL - All CPUs on node are available - * otherwise - Specific CPU count + * RET: NO_VAL - All cores on node are available + * otherwise - Count of available cores */ extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list, bool use_total_gres, bitstr_t *cpu_bitmap, diff --git a/src/common/gres.h b/src/common/gres.h index 525998ef29b..65d2470c8ac 100644 --- a/src/common/gres.h +++ b/src/common/gres.h @@ -374,8 +374,8 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer, * IN cpu_end_bit - index into cpu_bitmap for this node's last CPU * IN job_id - job's ID (for logging) * IN node_name - name of the node (for logging) - * RET: NO_VAL - All CPUs on node are available - * otherwise - Specific CPU count + * RET: NO_VAL - All cores on node are available 
+ * otherwise - Count of available cores */ extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list, bool use_total_gres, bitstr_t *cpu_bitmap, diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index a31d3520aab..1ce0972a458 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -588,7 +588,7 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map, bool test_only) { uint16_t cpus; - uint32_t avail_mem, req_mem, gres_cpus; + uint32_t avail_mem, req_mem, gres_cores, gres_cpus, cpus_per_core; int core_start_bit, core_end_bit, cpu_alloc_size; struct node_record *node_ptr = node_record_table_ptr + node_i; List gres_list; @@ -614,6 +614,8 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map, } core_start_bit = cr_get_coremap_offset(node_i); core_end_bit = cr_get_coremap_offset(node_i+1) - 1; + cpus_per_core = select_node_record[node_i].cpus / + (core_end_bit - core_start_bit + 1); node_ptr = select_node_record[node_i].node_ptr; if (cr_type & CR_MEMORY) { @@ -645,11 +647,14 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map, gres_list = node_usage[node_i].gres_list; else gres_list = node_ptr->gres_list; - gres_cpus = gres_plugin_job_test(job_ptr->gres_list, - gres_list, test_only, - core_map, core_start_bit, - core_end_bit, job_ptr->job_id, - node_ptr->name); + gres_cores = gres_plugin_job_test(job_ptr->gres_list, + gres_list, test_only, + core_map, core_start_bit, + core_end_bit, job_ptr->job_id, + node_ptr->name); + gres_cpus = gres_cores; + if (gres_cpus != NO_VAL) + gres_cpus *= cpus_per_core; if ((gres_cpus < job_ptr->details->ntasks_per_node) || ((job_ptr->details->cpus_per_task > 1) && (gres_cpus < job_ptr->details->cpus_per_task))) @@ -729,7 +734,8 @@ static int _verify_node_state(struct part_res_record *cr_part_ptr, enum node_cr_state job_node_req) { struct node_record 
*node_ptr; - uint32_t i, free_mem, gres_cpus, min_mem, size; + uint32_t i, free_mem, gres_cpus, gres_cores, min_mem, size; + int core_start_bit, core_end_bit, cpus_per_core; List gres_list; if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { @@ -748,7 +754,10 @@ static int _verify_node_state(struct part_res_record *cr_part_ptr, if (!bit_test(bitmap, i)) continue; node_ptr = select_node_record[i].node_ptr; - + core_start_bit = cr_get_coremap_offset(i); + core_end_bit = cr_get_coremap_offset(i+1) - 1; + cpus_per_core = select_node_record[i].cpus / + (core_end_bit - core_start_bit + 1); /* node-level memory check */ if ((job_ptr->details->pn_min_memory) && (cr_type & CR_MEMORY)) { @@ -771,10 +780,13 @@ static int _verify_node_state(struct part_res_record *cr_part_ptr, gres_list = node_usage[i].gres_list; else gres_list = node_ptr->gres_list; - gres_cpus = gres_plugin_job_test(job_ptr->gres_list, - gres_list, true, - NULL, 0, 0, job_ptr->job_id, - node_ptr->name); + gres_cores = gres_plugin_job_test(job_ptr->gres_list, + gres_list, true, + NULL, 0, 0, job_ptr->job_id, + node_ptr->name); + gres_cpus = gres_cores; + if (gres_cpus != NO_VAL) + gres_cpus *= cpus_per_core; if (gres_cpus == 0) { debug3("cons_res: _vns: node %s lacks gres", node_ptr->name); diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 116c923f76a..f353f68a5a5 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -635,7 +635,8 @@ static int _job_count_bitmap(struct cr_record *cr_ptr, struct node_record *node_ptr; uint32_t job_memory_cpu = 0, job_memory_node = 0; uint32_t alloc_mem = 0, job_mem = 0, avail_mem = 0; - uint32_t cpu_cnt, gres_cpus; + uint32_t cpu_cnt, gres_cpus, gres_cores; + int core_start_bit, core_end_bit, cpus_per_core; List gres_list; bool use_total_gres = true; @@ -675,11 +676,16 @@ static int _job_count_bitmap(struct cr_record *cr_ptr, gres_list = cr_ptr->nodes[i].gres_list; else 
gres_list = node_ptr->gres_list; - gres_cpus = gres_plugin_job_test(job_ptr->gres_list, - gres_list, use_total_gres, - NULL, 0, 0, job_ptr->job_id, - node_ptr->name); + core_start_bit = cr_get_coremap_offset(i); + core_end_bit = cr_get_coremap_offset(i+1) - 1; + cpus_per_core = cpu_cnt / (core_end_bit - core_start_bit + 1); + gres_cores = gres_plugin_job_test(job_ptr->gres_list, + gres_list, use_total_gres, + NULL, 0, 0, job_ptr->job_id, + node_ptr->name); + gres_cpus = gres_cores; if (gres_cpus != NO_VAL) { + gres_cpus *= cpus_per_core; if ((gres_cpus < cpu_cnt) || (gres_cpus < job_ptr->details->ntasks_per_node) || ((job_ptr->details->cpus_per_task > 1) && diff --git a/src/plugins/select/serial/job_test.c b/src/plugins/select/serial/job_test.c index 997360e1923..78cf7e41e48 100644 --- a/src/plugins/select/serial/job_test.c +++ b/src/plugins/select/serial/job_test.c @@ -101,7 +101,7 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map, bool test_only) { uint16_t cpus; - uint32_t avail_mem, req_mem, gres_cpus; + uint32_t avail_mem, req_mem, gres_cpus, gres_cores, cpus_per_core; int core_start_bit, core_end_bit; struct node_record *node_ptr = node_record_table_ptr + node_i; List gres_list; @@ -117,7 +117,8 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map, core_start_bit = cr_get_coremap_offset(node_i); core_end_bit = cr_get_coremap_offset(node_i + 1) - 1; node_ptr = select_node_record[node_i].node_ptr; - + cpus_per_core = select_node_record[node_i].cpus / + (core_end_bit - core_start_bit + 1); if ((cr_type & CR_MEMORY) && cpus) { req_mem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU; avail_mem = select_node_record[node_i].real_memory; @@ -131,11 +132,14 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map, gres_list = node_usage[node_i].gres_list; else gres_list = node_ptr->gres_list; - gres_cpus = gres_plugin_job_test(job_ptr->gres_list, - gres_list, test_only, - core_map, 
core_start_bit, - core_end_bit, job_ptr->job_id, - node_ptr->name); + gres_cores = gres_plugin_job_test(job_ptr->gres_list, + gres_list, test_only, + core_map, core_start_bit, + core_end_bit, job_ptr->job_id, + node_ptr->name); + gres_cpus = gres_cores; + if (gres_cpus != NO_VAL) + gres_cpus *= cpus_per_core; if ((gres_cpus < job_ptr->details->ntasks_per_node) || ((job_ptr->details->cpus_per_task > 1) && (gres_cpus < job_ptr->details->cpus_per_task))) @@ -215,8 +219,9 @@ static int _verify_node_state(struct part_res_record *cr_part_ptr, enum node_cr_state job_node_req) { struct node_record *node_ptr; - uint32_t i, free_mem, gres_cpus, min_mem; + uint32_t i, free_mem, gres_cpus, gres_cores, min_mem; int i_first, i_last; + int core_start_bit, core_end_bit, cpus_per_core; List gres_list; if (job_ptr->details->pn_min_memory & MEM_PER_CPU) @@ -232,7 +237,10 @@ static int _verify_node_state(struct part_res_record *cr_part_ptr, if (!bit_test(bitmap, i)) continue; node_ptr = select_node_record[i].node_ptr; - + core_start_bit = cr_get_coremap_offset(i); + core_end_bit = cr_get_coremap_offset(i+1) - 1; + cpus_per_core = select_node_record[i].cpus / + (core_end_bit - core_start_bit + 1); /* node-level memory check */ if ((job_ptr->details->pn_min_memory) && (cr_type & CR_MEMORY)) { @@ -251,10 +259,13 @@ static int _verify_node_state(struct part_res_record *cr_part_ptr, gres_list = node_usage[i].gres_list; else gres_list = node_ptr->gres_list; - gres_cpus = gres_plugin_job_test(job_ptr->gres_list, - gres_list, true, - NULL, 0, 0, job_ptr->job_id, - node_ptr->name); + gres_cores = gres_plugin_job_test(job_ptr->gres_list, + gres_list, true, + NULL, 0, 0, job_ptr->job_id, + node_ptr->name); + gres_cpus = gres_cores; + if (gres_cpus != NO_VAL) + gres_cpus *= cpus_per_core; if (gres_cpus == 0) { debug3("select/serial: node %s lacks gres", node_ptr->name); -- GitLab