diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 20bb6b9094d14c33d7f2658acb98cfb102162199..cfe3e7d121217f896fb833f74d117b175d9cdcaf 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -146,7 +146,7 @@ static bool _test_resv_overlap(node_space_map_t *node_space, uint32_t end_reserve); static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, uint32_t min_nodes, uint32_t max_nodes, - uint32_t req_nodes); + uint32_t req_nodes, bitstr_t *exc_core_bitmap); /* Log resource allocate table */ static void _dump_node_space_table(node_space_map_t *node_space_ptr) @@ -240,11 +240,12 @@ static int _num_feature_count(struct job_record *job_ptr) /* Attempt to schedule a specific job on specific available nodes * IN job_ptr - job to schedule * IN/OUT avail_bitmap - nodes available/selected to use + * IN exc_core_bitmap - cores which can not be used * RET SLURM_SUCCESS on success, otherwise an error code */ static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, uint32_t min_nodes, uint32_t max_nodes, - uint32_t req_nodes) + uint32_t req_nodes, bitstr_t *exc_core_bitmap) { bitstr_t *tmp_bitmap; int rc = SLURM_SUCCESS; @@ -289,7 +290,7 @@ static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, rc = select_g_job_test(job_ptr, *avail_bitmap, high_cnt, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, - preemptee_candidates, NULL); + preemptee_candidates, NULL, exc_core_bitmap); } /* Restore the feature counts */ @@ -306,15 +307,25 @@ static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, * then on shared nodes (if so configured). 
*/ uint16_t orig_shared; time_t now = time(NULL); + char str[100]; + preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); orig_shared = job_ptr->details->shared; job_ptr->details->shared = 0; tmp_bitmap = bit_copy(*avail_bitmap); + + if(exc_core_bitmap){ + bit_fmt(str, (sizeof(str) - 1), exc_core_bitmap); + debug2(" _try_sched with exclude core bitmap: %s", str); + } + rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, - preemptee_candidates, NULL); + preemptee_candidates, NULL, exc_core_bitmap); + job_ptr->details->shared = orig_shared; + if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) && (orig_shared != 0)) { FREE_NULL_BITMAP(*avail_bitmap); @@ -322,7 +333,7 @@ static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, - preemptee_candidates, NULL); + preemptee_candidates, NULL, exc_core_bitmap); } else FREE_NULL_BITMAP(tmp_bitmap); } @@ -502,6 +513,7 @@ static int _attempt_backfill(void) uint32_t time_limit, comp_time_limit, orig_time_limit; uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; + bitstr_t *exc_core_bitmap = NULL; time_t now, sched_start, later_start, start_res; node_space_map_t *node_space; struct timeval bf_time1, bf_time2; @@ -682,7 +694,8 @@ static int _attempt_backfill(void) TRY_LATER: FREE_NULL_BITMAP(avail_bitmap); start_res = later_start; later_start = 0; - j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap); + exc_core_bitmap = NULL; + j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, &exc_core_bitmap); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; continue; @@ -770,9 +783,7 @@ static int _attempt_backfill(void) slurmctld_diag_stats.bf_last_depth_try++; j = _try_sched(job_ptr, &avail_bitmap, - min_nodes, max_nodes, req_nodes); - debug2("backfill: finished _try_sched for job 
%u.", - job_ptr->job_id); + min_nodes, max_nodes, req_nodes, exc_core_bitmap); now = time(NULL); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; diff --git a/src/plugins/sched/builtin/builtin.c b/src/plugins/sched/builtin/builtin.c index 377a6f1a059e6a45127a5a74a80b9f8fe85de3a9..ea3fca194baa690f6dcd7991347082bf2c4ab463 100644 --- a/src/plugins/sched/builtin/builtin.c +++ b/src/plugins/sched/builtin/builtin.c @@ -138,6 +138,7 @@ static void _compute_start_times(void) struct job_record *job_ptr; struct part_record *part_ptr; bitstr_t *alloc_bitmap = NULL, *avail_bitmap = NULL; + bitstr_t *exc_core_bitmap = NULL; uint32_t max_nodes, min_nodes, req_nodes, time_limit; time_t now = time(NULL), sched_start, last_job_alloc; @@ -184,14 +185,14 @@ static void _compute_start_times(void) continue; } - j = job_test_resv(job_ptr, &now, true, &avail_bitmap); + j = job_test_resv(job_ptr, &now, true, &avail_bitmap, &exc_core_bitmap); if (j != SLURM_SUCCESS) continue; rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, - preemptee_candidates, NULL); + preemptee_candidates, NULL, exc_core_bitmap); if (rc == SLURM_SUCCESS) { last_job_update = now; if (job_ptr->time_limit == INFINITE) diff --git a/src/plugins/sched/wiki2/job_will_run.c b/src/plugins/sched/wiki2/job_will_run.c index 48e3fbf1efeb83caa6b16a498cc40fc53381e30d..c6bbab5b9dce288b6038363afc458985646c12ca 100644 --- a/src/plugins/sched/wiki2/job_will_run.c +++ b/src/plugins/sched/wiki2/job_will_run.c @@ -129,6 +129,7 @@ static char * _will_run_test(uint32_t jobid, time_t start_time, struct job_record *job_ptr = NULL; struct part_record *part_ptr; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; + bitstr_t *exc_core_bitmap = NULL; char *hostlist, *reply_msg = NULL; uint32_t min_nodes, max_nodes, req_nodes; int rc; @@ -173,7 +174,7 @@ static char * _will_run_test(uint32_t jobid, time_t start_time, /* Enforce reservation: access control, time and nodes */ 
start_res = start_time; - rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap); + rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap, &exc_core_bitmap); if (rc != SLURM_SUCCESS) { *err_code = -730; *err_msg = "Job denied access to reservation"; @@ -253,7 +254,7 @@ static char * _will_run_test(uint32_t jobid, time_t start_time, rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, - preemptee_candidates, NULL); + preemptee_candidates, NULL, exc_core_bitmap); if (preemptee_candidates) list_destroy(preemptee_candidates); @@ -412,6 +413,7 @@ static char * _will_run_test2(uint32_t jobid, time_t start_time, struct job_record *job_ptr = NULL, *pre_ptr; struct part_record *part_ptr; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; + bitstr_t *exc_core_bitmap = NULL; time_t start_res; uint32_t min_nodes, max_nodes, req_nodes; List preemptee_candidates = NULL, preempted_jobs = NULL; @@ -455,7 +457,7 @@ static char * _will_run_test2(uint32_t jobid, time_t start_time, /* Enforce reservation: access control, time and nodes */ start_res = start_time; - rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap); + rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap, &exc_core_bitmap); if (rc != SLURM_SUCCESS) { *err_code = -730; *err_msg = "Job denied access to reservation"; @@ -540,7 +542,7 @@ static char * _will_run_test2(uint32_t jobid, time_t start_time, orig_start_time = job_ptr->start_time; rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, - preemptee_candidates, &preempted_jobs); + preemptee_candidates, &preempted_jobs, exc_core_bitmap); if (preemptee_candidates) list_destroy(preemptee_candidates); diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index 6da83734426243cb788005c3a49136d8464f59f0..04abf102124ec8c3fedcf78b5ff846e1efacdf49 100644 --- a/src/plugins/select/bluegene/select_bluegene.c 
+++ b/src/plugins/select/bluegene/select_bluegene.c @@ -3132,7 +3132,7 @@ extern int select_p_reconfigure(void) #endif } -extern bitstr_t *select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) +extern bitstr_t *select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, bitstr_t **core_bitmap) { #ifdef HAVE_BG /* Reserve a block of appropriate geometry by issuing a fake job diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index d401fa4d68e871df17bb11b8038bd7397bf95b8e..e930c2efe927cdd17625ab6fd9c91b51dd23f554 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -1877,11 +1877,17 @@ static uint16_t *_select_nodes(struct job_record *job_ptr, uint32_t min_nodes, int rc; uint16_t *cpu_cnt, *cpus = NULL; uint32_t start, n, a; + //char str[100]; bitstr_t *req_map = job_ptr->details->req_node_bitmap; if (bit_set_count(node_map) < min_nodes) return NULL; + //bit_fmt(str, (sizeof(str) - 1), node_map); + //info("ALEJ: _select_nodes nodemap: %s", str); + //bit_fmt(str, (sizeof(str) - 1), core_map); + //info("ALEJ: _select_nodes coremap: %s", str); + /* get resource usage for this job from each available node */ _get_res_usage(job_ptr, node_map, core_map, cr_node_cnt, node_usage, cr_type, &cpu_cnt, test_only); @@ -1903,6 +1909,9 @@ static uint16_t *_select_nodes(struct job_record *job_ptr, uint32_t min_nodes, xfree(cpu_cnt); return NULL; } + + //bit_fmt(str, (sizeof(str) - 1), node_map); + //info("ALEJ: _select_nodes nodemap: %s", str); /* choose the best nodes for the job */ rc = _choose_nodes(job_ptr, node_map, min_nodes, max_nodes, req_nodes, @@ -1955,7 +1964,7 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, uint16_t cr_type, enum node_cr_state job_node_req, uint32_t cr_node_cnt, struct part_res_record *cr_part_ptr, - struct node_use_record *node_usage) + struct node_use_record *node_usage, bitstr_t *exc_core_bitmap) { int error_code = SLURM_SUCCESS, 
ll; /* ll = layout array index */ uint16_t *layout_ptr = NULL; @@ -2095,6 +2104,17 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); + if(exc_core_bitmap){ + char str[100]; + + bit_fmt(str, (sizeof(str) - 1), exc_core_bitmap); + debug2("excluding cores reserved: %s", str); + + bit_not(exc_core_bitmap); + bit_and(free_cores, exc_core_bitmap); + bit_not(exc_core_bitmap); + } + /* remove all existing allocations from free_cores */ tmpcore = bit_copy(free_cores); for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { @@ -2111,6 +2131,7 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); + if ((cpu_count) && (job_ptr->best_switch)) { /* job fits! We're done. */ if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { @@ -2141,6 +2162,12 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); + if(exc_core_bitmap){ + bit_not(exc_core_bitmap); + bit_and(free_cores, exc_core_bitmap); + bit_not(exc_core_bitmap); + } + for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) { if (jp_ptr->part_ptr == job_ptr->part_ptr) break; diff --git a/src/plugins/select/cons_res/job_test.h b/src/plugins/select/cons_res/job_test.h index 38a28fe6bd61bbab093f99a7fe61540f8ae41610..4a2ad6b5e480538afa0b31d535d450403bf898e4 100644 --- a/src/plugins/select/cons_res/job_test.h +++ b/src/plugins/select/cons_res/job_test.h @@ -66,6 +66,6 @@ int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode, uint16_t cr_type, enum node_cr_state job_node_req, uint32_t cr_node_cnt, struct part_res_record *cr_part_ptr, - struct node_use_record *node_usage); + struct node_use_record *node_usage, bitstr_t *exc_core_bitmap); #endif /* !_CR_JOB_TEST_H */ diff --git 
a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 081cdb96a434507dd9290cc972cafa5003aed5f4..07b35069fd5e998f5b7b07ed3904ee62bfcb7ec9 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -207,7 +207,7 @@ static int _rm_job_from_res(struct part_res_record *part_record_ptr, static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t job_node_req, - List preemptee_candidates, List *preemptee_job_list); + List preemptee_candidates, List *preemptee_job_list, bitstr_t *exc_core_bitmap); static int _sort_usable_nodes_dec(struct job_record *job_a, struct job_record *job_b); static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap, @@ -216,7 +216,7 @@ static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap, static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t job_node_req, - List preemptee_candidates, List *preemptee_job_list); + List preemptee_candidates, List *preemptee_job_list, bitstr_t *exc_core_bitmap); static void _dump_job_res(struct job_resources *job) { char str[64]; @@ -1497,7 +1497,7 @@ static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap, rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_TEST_ONLY, cr_type, job_node_req, select_node_cnt, select_part_record, - select_node_usage); + select_node_usage, NULL); return rc; } @@ -1520,7 +1520,7 @@ static int _sort_usable_nodes_dec(struct job_record *job_a, static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t job_node_req, - List preemptee_candidates, List *preemptee_job_list) + List preemptee_candidates, List *preemptee_job_list, bitstr_t *exc_core_bitmap) { int rc; bitstr_t *orig_map = NULL, 
*save_bitmap; @@ -1540,7 +1540,7 @@ top: orig_map = bit_copy(save_bitmap); rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_RUN_NOW, cr_type, job_node_req, select_node_cnt, select_part_record, - select_node_usage); + select_node_usage, exc_core_bitmap); if ((rc != SLURM_SUCCESS) && preemptee_candidates) { /* Remove preemptable jobs from simulated environment */ @@ -1580,7 +1580,7 @@ top: orig_map = bit_copy(save_bitmap); SELECT_MODE_WILL_RUN, cr_type, job_node_req, select_node_cnt, - future_part, future_usage); + future_part, future_usage, exc_core_bitmap); tmp_job_ptr->details->usable_nodes = 0; /* * If successful, set the last job's usable count to a @@ -1666,7 +1666,7 @@ top: orig_map = bit_copy(save_bitmap); static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t job_node_req, - List preemptee_candidates, List *preemptee_job_list) + List preemptee_candidates, List *preemptee_job_list, bitstr_t *exc_core_bitmap) { struct part_res_record *future_part; struct node_use_record *future_usage; @@ -1685,7 +1685,7 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, cr_type, job_node_req, select_node_cnt, select_part_record, - select_node_usage); + select_node_usage, exc_core_bitmap); if (rc == SLURM_SUCCESS) { FREE_NULL_BITMAP(orig_map); job_ptr->start_time = time(NULL); @@ -1743,7 +1743,7 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, cr_type, job_node_req, select_node_cnt, future_part, - future_usage); + future_usage, exc_core_bitmap); if (rc == SLURM_SUCCESS) job_ptr->start_time = now + 1; } @@ -1769,7 +1769,7 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, 
cr_type, job_node_req, select_node_cnt, - future_part, future_usage); + future_part, future_usage, exc_core_bitmap); if (rc == SLURM_SUCCESS) { if (tmp_job_ptr->end_time <= now) job_ptr->start_time = now + 1; @@ -1993,7 +1993,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t mode, List preemptee_candidates, - List *preemptee_job_list) + List *preemptee_job_list, bitstr_t *exc_core_bitmap) { int rc = EINVAL; uint16_t job_node_req; @@ -2001,6 +2001,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, xassert(bitmap); + debug2("select_p_job_test for job %u", job_ptr->job_id); if (!debug_check) { debug_check = true; if (slurm_get_debug_flags() & DEBUG_FLAG_CPU_BIND) @@ -2025,14 +2026,14 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, if (mode == SELECT_MODE_WILL_RUN) { rc = _will_run_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, job_node_req, - preemptee_candidates, preemptee_job_list); + preemptee_candidates, preemptee_job_list, exc_core_bitmap); } else if (mode == SELECT_MODE_TEST_ONLY) { rc = _test_only(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, job_node_req); } else if (mode == SELECT_MODE_RUN_NOW) { rc = _run_now(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, job_node_req, - preemptee_candidates, preemptee_job_list); + preemptee_candidates, preemptee_job_list, exc_core_bitmap); } else fatal("select_p_job_test: Mode %d is invalid", mode); @@ -2519,6 +2520,123 @@ extern int select_p_reconfigure(void) return SLURM_SUCCESS; }
+/*
+ * _make_core_bitmap_filtered - build a core bitmap sized for node_map
+ * IN node_map - node bitmap, node n owns the core bits in the range
+ *	cr_get_coremap_offset(n) .. cr_get_coremap_offset(n+1) - 1
+ * IN filter - zero: return a core bitmap with every bit clear
+ *	non-zero: set all the cores of each node set in node_map
+ * RET new core bitmap owned by the caller, NULL if it can not be
+ *	allocated
+ * NOTE: DUPLICATE CODE: see job_test.c
+ */
+bitstr_t *_make_core_bitmap_filtered(bitstr_t *node_map, int filter)
+{
+	uint32_t n, c, coff, nodes;
+	bitstr_t *core_map;
+
+	nodes = bit_size(node_map);
+	core_map = bit_alloc(cr_get_coremap_offset(nodes));
+	if (!core_map)
+		return NULL;
+
+	if (!filter)
+		return core_map;
+
+	for (n = 0; n < nodes; n++) {
+		if (!bit_test(node_map, n))
+			continue;
+		/* Restart at the first core of this node so the cores of
+		 * earlier nodes which are not in node_map stay clear */
+		c = cr_get_coremap_offset(n);
+		coff = cr_get_coremap_offset(n + 1);
+		while (c < coff)
+			bit_set(core_map, c++);
+	}
+	return core_map;
+}
+
+/*
+ * sequential_pick - pick reservation nodes (and cores) in bitmap order
+ * IN/OUT avail_bitmap - nodes not used by any job or reservation, the
+ *	nodes picked are cleared from it
+ * IN node_cnt - count of required nodes
+ * IN core_cnt - count of required cores, zero to reserve whole nodes,
+ *	otherwise a multiple of node_cnt (only symmetric requests)
+ * OUT core_bitmap - cores picked on the selected nodes, only set if
+ *	core_cnt is non-zero
+ * RET nodes selected, NULL if the request can not be satisfied
+ * NOTE: on failure the core bitmap built here is freed and *core_bitmap
+ *	is NULL, nodes picked before the failure stay cleared from
+ *	avail_bitmap
+ */
+bitstr_t *sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt,
+			  uint32_t core_cnt, bitstr_t **core_bitmap)
+{
+	bitstr_t *sp_avail_bitmap;
+	uint32_t cores_per_node = 0, rem_nodes = node_cnt, coff, c;
+	int inx, last;	/* signed so a bit_ffs() result < 0 is detectable */
+
+	if (core_cnt) {	/* Reservation is using partial nodes */
+		/* Validate before dividing and before using core_bitmap,
+		 * a remainder would make the per node count ambiguous */
+		if (!node_cnt || (core_cnt % node_cnt) || !core_bitmap) {
+			info("reservation request can not be satisfied");
+			return NULL;
+		}
+		cores_per_node = core_cnt / node_cnt;
+	}
+
+	debug2("reserving %u cores per node in %u nodes",
+	       cores_per_node, node_cnt);
+
+	sp_avail_bitmap = bit_alloc(bit_size(avail_bitmap));
+	if (sp_avail_bitmap == NULL)
+		fatal("memory allocation failure");
+
+	if (core_cnt) {
+		*core_bitmap = _make_core_bitmap_filtered(avail_bitmap, 0);
+		if (*core_bitmap == NULL)
+			fatal("memory allocation failure");
+	}
+
+	/* Walk the available nodes in index order */
+	inx = bit_ffs(avail_bitmap);
+	last = bit_fls(avail_bitmap);
+	for ( ; (inx >= 0) && (inx <= last) && rem_nodes; inx++) {
+		if (!bit_test(avail_bitmap, inx))
+			continue;
+
+		/* Skip nodes with too few cores, the cores picked would
+		 * otherwise spill over into the next node's range */
+		if (cores_per_node > cr_node_num_cores[inx])
+			continue;
+
+		coff = cr_get_coremap_offset(inx);
+		for (c = 0; c < cores_per_node; c++)
+			bit_set(*core_bitmap, coff + c);
+
+		/* Add this node to the final node bitmap */
+		bit_set(sp_avail_bitmap, inx);
+
+		/* Clear this node from the initial available bitmap */
+		bit_clear(avail_bitmap, inx);
+		rem_nodes--;
+	}
+
+	if (rem_nodes) {
+		/* Do not leak the core bitmap or leave the caller with
+		 * a pointer to a partially filled one */
+		info("reservation request can not be satisfied");
+		FREE_NULL_BITMAP(sp_avail_bitmap);
+		if (core_cnt)
+			FREE_NULL_BITMAP(*core_bitmap);
+		return NULL;
+	}
+
+	return sp_avail_bitmap;
+}
+
/* * select_p_resv_test - Identify the nodes which "best" satisfy a reservation * request. "best" is defined as either single set of consecutive nodes @@ -2526,9 +2644,10 @@ extern int select_p_reconfigure(void) * OR the fewest number of consecutive node sets * IN avail_bitmap - nodes available for the reservation * IN node_cnt - count of required nodes + * OUT core_bitmap - cores selected for the reservation, set only if core_cnt is non-zero * RET - nodes selected for use by the reservation */ -extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) +extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, uint32_t core_cnt, bitstr_t **core_bitmap) { bitstr_t **switches_bitmap; /* nodes on this switch */ int *switches_cpu_cnt; /* total CPUs on switch */ @@ -2536,7 +2655,8 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) int *switches_required; /* set if has required node */ bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */ - int rem_nodes; /* remaining resources desired */ + bitstr_t *sp_avail_bitmap; + int rem_nodes, rem_cores; /* remaining resources desired */ int i, j; int best_fit_inx, first, last; int best_fit_nodes; @@ -2544,13 +2664,16 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) bool sufficient; xassert(avail_bitmap); + if (!switch_record_cnt || !switch_record_table) - return bit_pick_cnt(avail_bitmap, node_cnt); + return sequential_pick(avail_bitmap, node_cnt, core_cnt, core_bitmap); /* Use topology state information */ if (bit_set_count(avail_bitmap) < node_cnt) return
avail_nodes_bitmap; + rem_nodes = node_cnt; + rem_cores = core_cnt; /* Construct a set of switch array entries, * use the same indexes as switch_record_table in slurmctld */ @@ -2558,11 +2681,26 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) switches_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt); switches_node_cnt = xmalloc(sizeof(int) * switch_record_cnt); switches_required = xmalloc(sizeof(int) * switch_record_cnt); + for (i=0; i<switch_record_cnt; i++) { + bitstr_t *switch_bitmap_copy; /* scratch copy of this switch's nodes, consumed while counting cores */ + int switch_nodes; /* nodes of this switch left to count; named so it does not shadow the node_cnt parameter */ switches_bitmap[i] = bit_copy(switch_record_table[i]. node_bitmap); bit_and(switches_bitmap[i], avail_bitmap); switches_node_cnt[i] = bit_set_count(switches_bitmap[i]); + switch_bitmap_copy = bit_copy(switches_bitmap[i]); + switch_nodes = switches_node_cnt[i]; + debug2("switch %d looking cores in %d nodes\n", i, switches_node_cnt[i]); + while(switch_nodes--){ + int node_inx; + node_inx = bit_ffs(switch_bitmap_copy); + switches_cpu_cnt[i] += cr_node_num_cores[node_inx]; + bit_nclear(switch_bitmap_copy, node_inx, node_inx); + } + + debug2("switch %d with %d nodes and %d cores\n", i, switches_node_cnt[i], switches_cpu_cnt[i]); + FREE_NULL_BITMAP(switch_bitmap_copy); /* release the scratch copy, one was leaked per switch */ } #if SELECT_DEBUG @@ -2583,7 +2721,7 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) /* Determine lowest level switch satifying request with best fit */ best_fit_inx = -1; for (j=0; j<switch_record_cnt; j++) { - if (switches_node_cnt[j] < rem_nodes) + if ((switches_node_cnt[j] < rem_nodes) || (core_cnt && (switches_cpu_cnt[j] < core_cnt))) continue; if ((best_fit_inx == -1) || (switch_record_table[j].level < @@ -2591,6 +2729,7 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) ((switch_record_table[j].level == switch_record_table[best_fit_inx].level) && (switches_node_cnt[j] < switches_node_cnt[best_fit_inx]))) + /* ALEJ: We should use core count by switch here as well */ best_fit_inx = j; } if (best_fit_inx == -1) 
{ @@ -2615,7 +2754,7 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) for (j=0; j<switch_record_cnt; j++) { if (switches_node_cnt[j] == 0) continue; - sufficient = (switches_node_cnt[j] >= rem_nodes); + sufficient = (switches_node_cnt[j] >= rem_nodes) && (switches_cpu_cnt[j] >= core_cnt); /* If first possibility OR */ /* first set large enough for request OR */ /* tightest fit (less resource waste) OR */ @@ -2637,8 +2776,9 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) first = bit_ffs(switches_bitmap[best_fit_location]); last = bit_fls(switches_bitmap[best_fit_location]); for (i=first; ((i<=last) && (first>=0)); i++) { - if (!bit_test(switches_bitmap[best_fit_location], i)) + if (!bit_test(switches_bitmap[best_fit_location], i)){ continue; + } bit_clear(switches_bitmap[best_fit_location], i); switches_node_cnt[best_fit_location]--; @@ -2650,6 +2790,8 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) } bit_set(avail_nodes_bitmap, i); + if(core_cnt) + rem_cores -= cr_node_num_cores[i]; if (--rem_nodes <= 0) break; } @@ -2660,11 +2802,61 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) fini: for (i=0; i<switch_record_cnt; i++) FREE_NULL_BITMAP(switches_bitmap[i]); + xfree(switches_bitmap); xfree(switches_cpu_cnt); xfree(switches_node_cnt); xfree(switches_required); + if(core_cnt){ /* Reservation is using partial nodes */ + + char str[100]; + int cores_per_node; + + sp_avail_bitmap = bit_alloc(bit_size(avail_bitmap)); + if(sp_avail_bitmap == NULL){ + fatal ("memory allocation failure"); + } + + *core_bitmap = _make_core_bitmap_filtered(avail_bitmap, 0); + + cores_per_node = core_cnt / node_cnt; + + while(core_cnt){ + uint32_t inx, coff; + int i; + + inx = bit_ffs(avail_bitmap); + if((inx < 0) || (inx > bit_size(avail_bitmap))) + break; + + debug2("Using node inx %d (cores_per_node: %d, core_cnt: %d", inx, cores_per_node, core_cnt); + 
coff = cr_get_coremap_offset(inx); + + for(i = 0; i < cores_per_node; i++){ /* TODO: checking cores_per_nodes is lower than real cores per node */ + bit_set(*core_bitmap, coff++); + core_cnt--; + } + + /* Add this node to the final node bitmap */ + bit_set(sp_avail_bitmap, inx); + + /* Clear this node from the initial available bitmap */ + bit_clear(avail_bitmap, inx); + } + + //bit_fmt(str, (sizeof(str) - 1), *core_bitmap); + //info("ALEJ: sequential pick using coremap: %s", str); + + if(core_cnt){ + info("reservation request can not be satisfied"); + FREE_NULL_BITMAP(sp_avail_bitmap); + return NULL; + } + + return sp_avail_bitmap; + } + return avail_nodes_bitmap; } diff --git a/src/plugins/select/cray/other_select.c b/src/plugins/select/cray/other_select.c index 18ddd3a219933cf147a3b1b41bd2072fd3351dfa..8b096d733ae19692fd3bd35d9562f6a46488d9e1 100644 --- a/src/plugins/select/cray/other_select.c +++ b/src/plugins/select/cray/other_select.c @@ -829,13 +829,13 @@ extern int other_reconfigure (void) * IN node_cnt - count of required nodes * RET - nodes selected for use by the reservation */ -extern bitstr_t * other_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) +extern bitstr_t * other_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, bitstr_t **core_bitmap) { if (other_select_init() < 0) return NULL; return (*(other_select_context->ops.resv_test)) - (avail_bitmap, node_cnt); + (avail_bitmap, node_cnt, 0, NULL); } extern void other_ba_init(node_info_msg_t *node_info_ptr, bool sanity_check) diff --git a/src/plugins/select/cray/other_select.h b/src/plugins/select/cray/other_select.h index ba2f746cebeba160e0ee6b4ed31b8adb2b0ba16c..e1b23f37652f89e034e05c2b7cad27762bb9b1be 100644 --- a/src/plugins/select/cray/other_select.h +++ b/src/plugins/select/cray/other_select.h @@ -371,7 +371,7 @@ extern int other_pack_select_info(time_t last_query_time, uint16_t show_flags, /* Note reconfiguration or change in partition configuration */ extern int other_reconfigure(void); 
-extern bitstr_t * other_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt); +extern bitstr_t * other_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, bitstr_t **core_bitmap); extern void other_ba_init(node_info_msg_t *node_info_ptr, bool sanity_check); extern void other_ba_fini(void); diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index 12db493c940f54648bb87ebb73cb54e3892e7a0c..b3d1bf997390ec7aa453951483fd3df5db08a878 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -811,9 +811,9 @@ extern int select_p_reconfigure(void) return other_reconfigure(); } -extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) +extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, bitstr_t **core_bitmap) { - return other_resv_test(avail_bitmap, node_cnt); + return other_resv_test(avail_bitmap, node_cnt, core_bitmap); } extern void select_p_ba_init(node_info_msg_t *node_info_ptr, bool sanity_check) diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 2bb3568e25f342921d108834766fc4ecbb45d92f..b839e06b8fc958f8f7bda85f826b5f6783703c46 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -3337,7 +3337,7 @@ extern int select_p_reconfigure(void) * IN node_cnt - count of required nodes * RET - nodes selected for use by the reservation */ -extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt) +extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, bitstr_t **core_bitmap) { bitstr_t **switches_bitmap; /* nodes on this switch */ int *switches_cpu_cnt; /* total CPUs on switch */ diff --git a/src/scontrol/create_res.c b/src/scontrol/create_res.c index ca68f1d8973519e22560edca878f409a071ceb7e..15b4d24daf2f01853371f397152042f20eaf34d4 100644 --- a/src/scontrol/create_res.c +++ 
b/src/scontrol/create_res.c @@ -283,6 +283,12 @@ scontrol_parse_res_options(int argc, char *argv[], const char *msg, tok = strtok_r(NULL, ",", &ptrptr); } xfree(node_cnt); + + } else if (strncasecmp(tag, "CoreCnt", MAX(taglen,5)) == 0 || + strncasecmp(tag, "CoreCount", MAX(taglen,5)) == 0) { + char *endptr = NULL; + resv_msg_ptr->core_cnt = strtol(val, &endptr, 10); + } else if (strncasecmp(tag, "Nodes", MAX(taglen, 5)) == 0) { resv_msg_ptr->node_list = val;