From 992fc7ceba54d472f6dd4e882f4aa7929edb9e55 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 28 Nov 2007 20:41:08 +0000 Subject: [PATCH] apply fixme.patch with fixes for chris holmes' recent max share work. --- src/common/slurm_resource_info.c | 4 +- src/plugins/select/cons_res/dist_tasks.c | 72 ++-- src/plugins/select/cons_res/select_cons_res.c | 340 +++++++++--------- src/plugins/select/cons_res/select_cons_res.h | 2 +- src/slurmctld/node_scheduler.c | 170 +++++---- 5 files changed, 325 insertions(+), 263 deletions(-) diff --git a/src/common/slurm_resource_info.c b/src/common/slurm_resource_info.c index 5e0268cf4b6..690f8df0bab 100644 --- a/src/common/slurm_resource_info.c +++ b/src/common/slurm_resource_info.c @@ -106,7 +106,7 @@ int slurm_get_avail_procs(const uint16_t max_sockets, *cores = 1; if (*sockets <= 0) *sockets = *cpus / *cores / *threads; - for (i = 0 ; i < *sockets; i++) { + for (i = 0 ; alloc_cores && i < *sockets; i++) { allocated_cores += alloc_cores[i]; if (alloc_cores[i]) allocated_sockets++; @@ -124,7 +124,7 @@ int slurm_get_avail_procs(const uint16_t max_sockets, info("get_avail_procs %u %s cr_type %d cpus %u alloc_ c %u s %u", job_id, name, cr_type, *cpus, allocated_cores, allocated_sockets); - for (i = 0; i < *sockets; i++) + for (i = 0; alloc_cores && i < *sockets; i++) info("get_avail_procs %u %s alloc_cores[%d] = %u", job_id, name, i, alloc_cores[i]); #endif diff --git a/src/plugins/select/cons_res/dist_tasks.c b/src/plugins/select/cons_res/dist_tasks.c index 0941d70a068..4a9cabe615e 100644 --- a/src/plugins/select/cons_res/dist_tasks.c +++ b/src/plugins/select/cons_res/dist_tasks.c @@ -98,15 +98,13 @@ int compute_c_b_task_dist(struct select_cr_job *job) /* scan all rows looking for the best fit, and return the offset */ int _find_offset(struct select_cr_job *job, const int job_index, - uint16_t cores, uint16_t sockets, + uint16_t cores, uint16_t sockets, uint32_t maxcores, const select_type_plugin_info_t cr_type, struct node_cr_record *this_cr_node) { struct part_cr_record *p_ptr; int i, j, index, offset, skip; - uint16_t acpus, asockets, freecpus, last_freecpus = 0; - uint32_t maxtasks = job->alloc_cpus[job_index]; - uint16_t uint16_tmp, threads, usable_threads; + uint16_t acores, asockets, freecpus, last_freecpus = 0; p_ptr = get_cr_part_ptr(this_cr_node, job->partition); if (p_ptr == NULL) @@ -115,14 +113,9 @@ int _find_offset(struct select_cr_job *job, const int job_index, fatal("cons_res: find_offset: could not find part %s", job->partition); - /* get thread count per core */ - get_resources_this_node(&uint16_tmp, &uint16_tmp, &uint16_tmp, - &threads, this_cr_node, job->job_id); - usable_threads = MIN(job->max_threads, threads); - index = -1; for (i = 0; i < p_ptr->num_rows; i++) { - acpus = 0; + acores = 0; asockets = 0; skip = 0; offset = i * this_cr_node->num_sockets; @@ -131,9 +124,9 @@ int _find_offset(struct select_cr_job *job, const int job_index, job->min_cores) { /* count the number of unusable sockets */ skip++; - acpus += cores; + acores += cores; } else { - acpus += p_ptr->alloc_cores[offset+j]; + acores += p_ptr->alloc_cores[offset+j]; } if(p_ptr->alloc_cores[offset+j]) asockets++; @@ -147,9 +140,8 @@ int _find_offset(struct select_cr_job *job, const int job_index, continue; } - freecpus = (cores * sockets) - acpus; - freecpus *= usable_threads; - if (freecpus < maxtasks) + freecpus = (cores * sockets) - acores; + if (freecpus < maxcores) continue; if (index < 0) { @@ -179,9 +171,9 @@ void _job_assign_tasks(struct 
select_cr_job *job, uint16_t cores, cpus, sockets, threads; uint16_t usable_cores, usable_sockets, usable_threads; uint16_t *avail_cores; - uint32_t taskcount, last_taskcount; + uint32_t corecount, last_corecount; uint16_t asockets, offset, total; - uint32_t maxtasks = job->alloc_cpus[job_index]; + uint32_t maxcores, reqcores, maxtasks = job->alloc_cpus[job_index]; struct part_cr_record *p_ptr; p_ptr = get_cr_part_ptr(this_cr_node, job->partition); @@ -200,15 +192,25 @@ void _job_assign_tasks(struct select_cr_job *job, usable_cores = MIN(job->max_cores, cores); usable_threads = MIN(job->max_threads, threads); - offset = _find_offset(job, job_index, cores, sockets, cr_type, + /* determine the number of required cores. When multiple threads + * are available, the maxtasks value may not reflect the requested + * core count, which is what we are seeking here. */ + maxcores = maxtasks / usable_threads; + while ((maxcores * usable_threads) < maxtasks) + maxcores++; + reqcores = job->min_cores * job->min_sockets; + if (maxcores < reqcores) + maxcores = reqcores; + + offset = _find_offset(job, job_index, cores, sockets, maxcores, cr_type, this_cr_node); job->node_offset[job_index] = offset; debug3("job_assign_task %u s_ min %u u %u c_ min %u u %u" - " t_ min %u u %u task %u offset %u", + " t_ min %u u %u task %u core %u offset %u", job->job_id, job->min_sockets, usable_sockets, job->min_cores, usable_cores, job->min_threads, - usable_threads, maxtasks, offset); + usable_threads, maxtasks, maxcores, offset); avail_cores = xmalloc(sizeof(uint16_t) * sockets); for (i = 0; i < sockets; i++) { @@ -218,7 +220,7 @@ void _job_assign_tasks(struct select_cr_job *job, total = 0; asockets = 0; for (i = 0; i < sockets; i++) { - if ((total >= maxtasks) && (asockets >= job->min_sockets)) { + if ((total >= maxcores) && (asockets >= job->min_sockets)) { break; } if (this_cr_node->node_ptr->cores <= @@ -237,7 +239,7 @@ void _job_assign_tasks(struct select_cr_job *job, avail_cores[i] = 0; } if (avail_cores[i] > 0) { - total += avail_cores[i]*usable_threads; + total += avail_cores[i]; asockets++; } } @@ -263,42 +265,42 @@ void _job_assign_tasks(struct select_cr_job *job, } if (asockets < job->min_sockets) { - error("cons_res: %u maxtasks %u Cannot satisfy" + error("cons_res: %u maxcores %u Cannot satisfy" " request -B %u:%u: Using -B %u:%u", - job->job_id, maxtasks, job->min_sockets, + job->job_id, maxcores, job->min_sockets, job->min_cores, asockets, job->min_cores); } - taskcount = 0; + corecount = 0; if (cyclic) { /* distribute tasks cyclically across the sockets */ - for (i=1; taskcount<maxtasks; i++) { - last_taskcount = taskcount; - for (j=0; ((j<sockets) && (taskcount<maxtasks)); j++) { + for (i=1; corecount<maxcores; i++) { + last_corecount = corecount; + for (j=0; ((j<sockets) && (corecount<maxcores)); j++) { if (avail_cores[j] == 0) continue; if (i<=avail_cores[j]) { job->alloc_cores[job_index][j]++; - taskcount += usable_threads; + corecount++; } } - if (last_taskcount == taskcount) { + if (last_corecount == corecount) { /* Avoid possible infinite loop on error */ fatal("_job_assign_tasks failure"); } } } else { /* distribute tasks in blocks across the sockets */ - for (j=0; ((j<sockets) && (taskcount<maxtasks)); j++) { - last_taskcount = taskcount; + for (j=0; ((j<sockets) && (corecount<maxcores)); j++) { + last_corecount = corecount; if (avail_cores[j] == 0) continue; for (i = 0; (i < avail_cores[j]) && - (taskcount<maxtasks); i++) { + (corecount<maxcores); i++) { job->alloc_cores[job_index][j]++; - 
taskcount += usable_threads; + corecount++; } - if (last_taskcount == taskcount) { + if (last_corecount == corecount) { /* Avoid possible infinite loop on error */ fatal("_job_assign_tasks failure"); } diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index e8b563a5d58..a1b8293d0b9 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -383,7 +383,7 @@ void chk_resize_node(struct node_cr_record *node, uint16_t sockets) node->node_ptr->name, node->num_sockets, sockets); for (p_ptr = node->parts; p_ptr; p_ptr = p_ptr->next) { xrealloc(p_ptr->alloc_cores, - sockets * sizeof(uint16_t)); + sockets * p_ptr->num_rows * sizeof(uint16_t)); /* NOTE: xrealloc zero fills added memory */ } node->num_sockets = sockets; @@ -483,23 +483,22 @@ uint16_t _get_cpu_data (struct part_cr_record *p_ptr, int num_sockets, } /* - * _get_avail_lps - Get the number of "available" cpus on a node - * given the number of cpus_per_task and - * maximum sockets, cores, threads. Note that the value of - * cpus is the lowest-level logical processor (LLLP). + * _get_task_count - Given the job requirements, compute the number of tasks + * this node can run + * * IN job_ptr - pointer to job being scheduled * IN index - index of node's configuration information in select_node_ptr */ -static uint16_t _get_avail_lps(struct job_record *job_ptr, const int index, - const bool all_available, bool try_partial_idle, - enum node_cr_state job_node_req) +static uint16_t _get_task_count(struct job_record *job_ptr, const int index, + const bool all_available, bool try_partial_idle, + enum node_cr_state job_node_req) { - uint16_t avail_cpus, cpus_per_task = 0; + uint16_t numtasks, cpus_per_task = 0; uint16_t max_sockets = 0, max_cores = 0, max_threads = 0; uint16_t min_sockets = 0, min_cores = 0, min_threads = 0; uint16_t ntasks_per_node = 0, ntasks_per_socket = 0, ntasks_per_core = 0; uint16_t i, cpus, sockets, cores, threads, *alloc_cores = NULL; - struct node_cr_record *this_cr_node; + struct node_cr_record *this_node; struct part_cr_record *p_ptr; struct multi_core_data *mc_ptr = NULL; @@ -519,46 +518,48 @@ static uint16_t _get_avail_lps(struct job_record *job_ptr, const int index, ntasks_per_core = mc_ptr->ntasks_per_core; } - this_cr_node = find_cr_node_record (select_node_ptr[index].node_ptr->name); - if (this_cr_node == NULL) { - error("cons_res: _get_avail_lps: could not find node %s", - select_node_ptr[index].node_ptr->name); - return 0; - } + this_node = &(select_node_ptr[index]); get_resources_this_node(&cpus, &sockets, &cores, &threads, - this_cr_node, job_ptr->job_id); + this_node, job_ptr->job_id); - chk_resize_node(this_cr_node, sockets); - alloc_cores = xmalloc(sizeof(uint16_t) * sockets); + chk_resize_node(this_node, sockets); + alloc_cores = xmalloc(sockets * sizeof(uint16_t)); for (i = 0; i < sockets; i++) alloc_cores[i] = 0; if (!all_available) { - p_ptr = get_cr_part_ptr(this_cr_node, job_ptr->partition); + p_ptr = get_cr_part_ptr(this_node, job_ptr->partition); if (!p_ptr) { - error("cons_res: _get_avail_lps: could not find part %s", + error("cons_res: _get_task_count: could not find part %s", job_ptr->part_ptr->name); } else { if (job_node_req == NODE_CR_ONE_ROW) { /* need to scan over all partitions with * num_rows = 1 */ - for (p_ptr = this_cr_node->parts; p_ptr; + for (p_ptr = this_node->parts; p_ptr; p_ptr = p_ptr->next) { if (p_ptr->num_rows > 1) continue; - for (i = 0; i < sockets; i++) + for (i = 0; 
i < sockets; i++) { + if (cr_type == CR_SOCKET || + cr_type == CR_SOCKET_MEMORY) { + if (p_ptr->alloc_cores[i]) + alloc_cores[i] = cores; + } else { alloc_cores[i] = p_ptr->alloc_cores[i]; + } + } } } else { - /* job_node_req == *EXCLUSIVE | *AVAILABLE + /* job_node_req == EXCLUSIVE | AVAILABLE * if EXCLUSIVE, then node *should* be free and - * this code should fall through with alloc_cores - * all set to zero. + * this code should fall through with + * alloc_cores all set to zero. * if AVAILABLE then scan partition rows based * on 'try_partial_idle' setting. Note that * if 'try_partial_idle' is FALSE then this - * code should use a 'free' row and this is where - * a new row will first be evaluated. + * code should use a 'free' row and this is + * where a new row will first be evaluated. */ uint16_t count, max_cpus; int alloc_row, free_row; @@ -593,32 +594,29 @@ static uint16_t _get_avail_lps(struct job_record *job_ptr, const int index, } #if (CR_DEBUG) for (i = 0; i < sockets; i+=2) { - info("cons_res: _get_avail_lps: %s alloc_cores[%d]=%d, [%d]=%d", - this_cr_node->node_ptr->name, i, alloc_cores[i], i+1, - alloc_cores[i+1]); + info("cons_res: _get_task_count: %s alloc_cores[%d]=%d, [%d]=%d", + this_node->node_ptr->name, i, alloc_cores[i], + i+1, alloc_cores[i+1]); } #endif - avail_cpus = slurm_get_avail_procs(max_sockets, - max_cores, - max_threads, - min_sockets, - min_cores, - cpus_per_task, - ntasks_per_node, - ntasks_per_socket, - ntasks_per_core, - &cpus, &sockets, &cores, - &threads, alloc_cores, - cr_type, job_ptr->job_id, - this_cr_node->node_ptr->name); + numtasks = slurm_get_avail_procs(max_sockets, max_cores, max_threads, + min_sockets, min_cores, + cpus_per_task, + ntasks_per_node, + ntasks_per_socket, + ntasks_per_core, + &cpus, &sockets, &cores, + &threads, alloc_cores, + cr_type, job_ptr->job_id, + this_node->node_ptr->name); #if (CR_DEBUG) - info("cons_res: _get_avail_lps computed a_cpus %d s %d c %d t %d on %s for job %d", - avail_cpus, sockets, cores, threads, this_cr_node->node_ptr->name, + info("cons_res: _get_task_count computed a_tasks %d s %d c %d t %d on %s for job %d", + numtasks, sockets, cores, threads, this_node->node_ptr->name, job_ptr->job_id); #endif xfree(alloc_cores); - return(avail_cpus); + return(numtasks); } /* xfree an array of node_cr_record */ @@ -719,26 +717,32 @@ uint16_t _count_idle_cpus(struct node_cr_record *this_node) { struct part_cr_record *p_ptr; int i, j, index, idlecpus; + uint16_t cpus, sockets, cores, threads; if (this_node->node_state == NODE_CR_RESERVED) return (uint16_t) 0; - if (select_fast_schedule) { - idlecpus = this_node->node_ptr->config_ptr->cpus; - } else { - idlecpus = this_node->node_ptr->cpus; - } + get_resources_this_node(&cpus, &sockets, &cores, &threads, + this_node, 0); if (!this_node->parts) - return (uint16_t) idlecpus; + return cpus; + idlecpus = cpus; if (this_node->node_state == NODE_CR_ONE_ROW) { /* check single-row partitions for idle CPUs */ for (p_ptr = this_node->parts; p_ptr; p_ptr = p_ptr->next) { if (p_ptr->num_rows > 1) continue; - for (i = 0; i < this_node->num_sockets; i++) + for (i = 0; i < this_node->num_sockets; i++) { + if (cr_type == CR_SOCKET || + cr_type == CR_SOCKET_MEMORY) { + if (p_ptr->alloc_cores[i]) + idlecpus -= cores; + } else { idlecpus -= p_ptr->alloc_cores[i]; + } + } if (idlecpus < 1) return (uint16_t) 0; } @@ -747,15 +751,23 @@ uint16_t _count_idle_cpus(struct node_cr_record *this_node) if (this_node->node_state == NODE_CR_AVAILABLE) { /* check all partitions for idle CPUs */ - int 
cpus, max_idle = 0; + int tmpcpus, max_idle = 0; for (p_ptr = this_node->parts; p_ptr; p_ptr = p_ptr->next) { - cpus = idlecpus; for (i = 0, index = 0; i < p_ptr->num_rows; i++) { - for (j = 0; j < this_node->num_sockets; j++, - index++) - cpus -= p_ptr->alloc_cores[index]; - if (cpus > max_idle) { - max_idle = cpus; + tmpcpus = idlecpus; + for (j = 0; + j < this_node->num_sockets; + j++, index++) { + if (cr_type == CR_SOCKET || + cr_type == CR_SOCKET_MEMORY) { + if (p_ptr->alloc_cores[i]) + tmpcpus -= cores; + } else { + tmpcpus -= p_ptr->alloc_cores[index]; + } + } + if (tmpcpus > max_idle) { + max_idle = tmpcpus; if (max_idle == idlecpus) break; } @@ -1629,37 +1641,36 @@ extern int select_p_block_init(List part_list) return SLURM_SUCCESS; } -/* return the number of cpus on the indexed node - that can be used by the given job */ -static int _get_avail_cpus(struct job_record *job_ptr, const int node_index, - int *av_cpus, int *freq, int size) +/* return the number of tasks that the given + job can run on the indexed node */ +static int _get_task_cnt(struct job_record *job_ptr, const int node_index, + int *task_cnt, int *freq, int size) { - int index, pos, cpus; + int i, pos, tasks; uint16_t * layout_ptr = NULL; if (job_ptr->details) layout_ptr = job_ptr->details->req_node_layout; pos = 0; - for (index = 0; index < size; index++) { - if (pos+freq[index] > node_index) + for (i = 0; i < size; i++) { + if (pos+freq[i] > node_index) break; - pos += freq[index]; + pos += freq[i]; } - cpus = av_cpus[index]; - if (layout_ptr && bit_test(job_ptr->details->req_node_bitmap, index)) { - pos = bit_get_pos_num(job_ptr->details->req_node_bitmap, - index); - cpus = MIN(cpus, layout_ptr[pos]); + tasks = task_cnt[i]; + if (layout_ptr && bit_test(job_ptr->details->req_node_bitmap, i)) { + pos = bit_get_pos_num(job_ptr->details->req_node_bitmap, i); + tasks = MIN(tasks, layout_ptr[pos]); } else if (layout_ptr) { - cpus = 0; /* should not happen? */ + tasks = 0; /* should not happen? 
*/ } - return cpus; + return tasks; } int _eval_nodes(struct job_record *job_ptr, bitstr_t * bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, - int *av_cpus, int *freq, int array_size) + int *task_cnt, int *freq, int array_size) { int i, f, index, error_code = SLURM_ERROR; int *consec_nodes; /* how many nodes we can add from this @@ -1724,7 +1735,7 @@ int _eval_nodes(struct job_record *job_ptr, bitstr_t * bitmap, if (bit_test(bitmap, index)) { if (consec_nodes[consec_index] == 0) consec_start[consec_index] = index; - avail_cpus = av_cpus[i]; + avail_cpus = task_cnt[i]; if (layout_ptr && required_node){ avail_cpus = MIN(avail_cpus, layout_ptr[ll]); } else if (layout_ptr) { @@ -1822,9 +1833,9 @@ int _eval_nodes(struct job_record *job_ptr, bitstr_t * bitmap, bit_set(bitmap, i); rem_nodes--; max_nodes--; - avail_cpus = _get_avail_cpus(job_ptr, i, - av_cpus, freq, - array_size); + avail_cpus = _get_task_cnt(job_ptr, i, + task_cnt, freq, + array_size); rem_cpus -= avail_cpus; } for (i = (best_fit_req - 1); @@ -1834,9 +1845,9 @@ int _eval_nodes(struct job_record *job_ptr, bitstr_t * bitmap, break; if (bit_test(bitmap, i)) continue; - avail_cpus = _get_avail_cpus(job_ptr, i, - av_cpus, freq, - array_size); + avail_cpus = _get_task_cnt(job_ptr, i, + task_cnt, freq, + array_size); if(avail_cpus <= 0) continue; rem_cpus -= avail_cpus; @@ -1852,9 +1863,9 @@ int _eval_nodes(struct job_record *job_ptr, bitstr_t * bitmap, break; if (bit_test(bitmap, i)) continue; - avail_cpus = _get_avail_cpus(job_ptr, i, - av_cpus, freq, - array_size); + avail_cpus = _get_task_cnt(job_ptr, i, + task_cnt, freq, + array_size); if(avail_cpus <= 0) continue; rem_cpus -= avail_cpus; @@ -1887,12 +1898,12 @@ int _eval_nodes(struct job_record *job_ptr, bitstr_t * bitmap, /* this is an intermediary step between select_p_job_test and _eval_nodes * to tackle the knapsack problem. This code incrementally removes nodes - * with low CPU availability for the job and re-evaluates each result */ + * with low task counts for the job and re-evaluates each result */ int _select_nodes(struct job_record *job_ptr, bitstr_t * bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, - int *av_cpus, int *freq, int array_size) + int *task_cnt, int *freq, int array_size) { - int i, b, count, ec, max_av_cpus = 0; + int i, b, count, ec, most_tasks = 0; bitstr_t *origmap, *reqmap = NULL; /* allocated node count should never exceed num_procs, right? @@ -1907,7 +1918,7 @@ int _select_nodes(struct job_record *job_ptr, bitstr_t * bitmap, fatal("bit_copy malloc failure"); ec = _eval_nodes(job_ptr, bitmap, min_nodes, max_nodes, - req_nodes, av_cpus, freq, array_size); + req_nodes, task_cnt, freq, array_size); if (ec == SLURM_SUCCESS) { bit_free(origmap); @@ -1915,21 +1926,21 @@ int _select_nodes(struct job_record *job_ptr, bitstr_t * bitmap, } /* This nodeset didn't work. 
To avoid a possible knapsack problem, - * incrementally remove nodes with low CPU availability and retry */ + * incrementally remove nodes with low task counts and retry */ for (i = 0; i < array_size; i++) { - if (av_cpus[i] > max_av_cpus) - max_av_cpus = av_cpus[i]; + if (task_cnt[i] > most_tasks) + most_tasks = task_cnt[i]; } if (job_ptr->details && job_ptr->details->req_node_bitmap) reqmap = job_ptr->details->req_node_bitmap; - for (count = 0; count < max_av_cpus; count++) { + for (count = 0; count < most_tasks; count++) { int nochange = 1; bit_or(bitmap, origmap); for (i = 0, b = 0; i < array_size; i++) { - if (av_cpus[i] != -1 && av_cpus[i] <= count) { + if (task_cnt[i] != -1 && task_cnt[i] <= count) { int j = 0, x = b; for (; j < freq[i]; j++, x++) { if (!bit_test(bitmap, x)) @@ -1948,7 +1959,7 @@ int _select_nodes(struct job_record *job_ptr, bitstr_t * bitmap, if (nochange) continue; ec = _eval_nodes(job_ptr, bitmap, min_nodes, max_nodes, - req_nodes, av_cpus, freq, array_size); + req_nodes, task_cnt, freq, array_size); if (ec == SLURM_SUCCESS) { bit_free(origmap); return ec; @@ -2146,78 +2157,79 @@ int _get_allocated_rows(struct job_record *job_ptr, int n, return rows; } -int _load_arrays(struct job_record *job_ptr, bitstr_t *bitmap, int **al_rows, - int **mf_cpus, int **al_cpus, int **freq, bool test_only, +int _load_arrays(struct job_record *job_ptr, bitstr_t *bitmap, int **a_rows, + int **s_tasks, int **a_tasks, int **freq, bool test_only, enum node_cr_state job_node_req) { int i, index = 0, size = 32; - int *alloc_rows, *mfree_cpus, *all_cpus, *num_nodes; + int *busy_rows, *shr_tasks, *all_tasks, *num_nodes; - alloc_rows = xmalloc (sizeof(int)*size); /* allocated rows */ - mfree_cpus = xmalloc (sizeof(int)*size); /* max free cpus */ - all_cpus = xmalloc (sizeof(int)*size); /* all cpus */ - num_nodes = xmalloc (sizeof(int)*size); /* number of nodes */ - alloc_rows[index] = 0; - mfree_cpus[index] = 0; - all_cpus[index] = 0; - num_nodes[index] = 0; + busy_rows = xmalloc (sizeof(int)*size); /* allocated rows */ + shr_tasks = xmalloc (sizeof(int)*size); /* max free cpus */ + all_tasks = xmalloc (sizeof(int)*size); /* all cpus */ + num_nodes = xmalloc (sizeof(int)*size); /* number of nodes */ + busy_rows[index] = 0; + shr_tasks[index] = 0; + all_tasks[index] = 0; + num_nodes[index] = 0; for (i = 0; i < select_node_cnt; i++) { if (bit_test(bitmap, i)) { - int rows, pcpu, cpus; + int rows; + uint16_t atasks, ptasks; rows = _get_allocated_rows(job_ptr, i, job_node_req); /* false = use free rows (if available) */ - cpus = _get_avail_lps(job_ptr, i, test_only, false, - job_node_req); + atasks = _get_task_count(job_ptr, i, test_only, false, + job_node_req); if (test_only) { - pcpu = cpus; + ptasks = atasks; } else { /* true = try using an already allocated row */ - pcpu = _get_avail_lps(job_ptr, i, test_only, - true, job_node_req); + ptasks = _get_task_count(job_ptr, i, test_only, + true, job_node_req); } - if (rows != alloc_rows[index] || - pcpu != mfree_cpus[index] || - cpus != all_cpus[index]) { + if (rows != busy_rows[index] || + ptasks != shr_tasks[index] || + atasks != all_tasks[index]) { if (num_nodes[index]) { index++; if (index >= size) { size *= 2; - xrealloc(alloc_rows, + xrealloc(busy_rows, sizeof(int)*size); - xrealloc(mfree_cpus, + xrealloc(shr_tasks, sizeof(int)*size); - xrealloc(all_cpus, + xrealloc(all_tasks, sizeof(int)*size); xrealloc(num_nodes, sizeof(int)*size); } num_nodes[index] = 0; } - alloc_rows[index] = rows; - mfree_cpus[index] = pcpu; - all_cpus[index] = cpus; 
+ busy_rows[index] = rows; + shr_tasks[index] = ptasks; + all_tasks[index] = atasks; } } else { - if (alloc_rows[index] != -1) { + if (busy_rows[index] != -1) { if (num_nodes[index] > 0) { index++; if (index >= size) { size *= 2; - xrealloc(alloc_rows, + xrealloc(busy_rows, sizeof(int)*size); - xrealloc(mfree_cpus, + xrealloc(shr_tasks, sizeof(int)*size); - xrealloc(all_cpus, + xrealloc(all_tasks, sizeof(int)*size); xrealloc(num_nodes, sizeof(int)*size); } num_nodes[index] = 0; } - alloc_rows[index] = -1; - mfree_cpus[index] = -1; - all_cpus[index] = -1; + busy_rows[index] = -1; + shr_tasks[index] = -1; + all_tasks[index] = -1; } } num_nodes[index]++; @@ -2226,14 +2238,13 @@ int _load_arrays(struct job_record *job_ptr, bitstr_t *bitmap, int **al_rows, index++; for (i = 0; i < index; i++) { - debug3("cons_res: i %d row %d fcpus %d acpus %d freq %d", - i, alloc_rows[i], mfree_cpus[i], all_cpus[i], - num_nodes[i]); + debug3("cons_res: i %d row %d ptasks %d atasks %d freq %d", + i, busy_rows[i], shr_tasks[i], all_tasks[i], num_nodes[i]); } - *al_rows = alloc_rows; - *mf_cpus = mfree_cpus; - *al_cpus = all_cpus; + *a_rows = busy_rows; + *s_tasks = shr_tasks; + *a_tasks = all_tasks; *freq = num_nodes; return index; @@ -2275,7 +2286,7 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, uint16_t plane_size = 0; enum node_cr_state job_node_req; int array_size; - int *al_rows, *mf_cpus, *al_cpus, *freq; + int *busy_rows, *sh_tasks, *al_tasks, *freq; bitstr_t *origmap, *reqmap = NULL; int row, rows, try; @@ -2316,17 +2327,17 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, } /* compute condensed arrays of node allocation data */ - array_size = _load_arrays(job_ptr, bitmap, &al_rows, &mf_cpus, - &al_cpus, &freq, test_only, job_node_req); + array_size = _load_arrays(job_ptr, bitmap, &busy_rows, &sh_tasks, + &al_tasks, &freq, test_only, job_node_req); if (test_only) { /* try with all nodes and all possible cpus */ error_code = _select_nodes(job_ptr, bitmap, min_nodes, - max_nodes, req_nodes, al_cpus, freq, + max_nodes, req_nodes, al_tasks, freq, array_size); - xfree(al_rows); - xfree(mf_cpus); - xfree(al_cpus); + xfree(busy_rows); + xfree(sh_tasks); + xfree(al_tasks); xfree(freq); return error_code; } @@ -2353,11 +2364,11 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, /* Step 1: * remove nodes from bitmap (unless required) - * who's al_rows value is bigger than 'row'. + * who's busy_rows value is bigger than 'row'. * Why? to enforce "least-loaded" over * "contiguous" */ - if (al_rows[i] > row || - (al_rows[i] == row && mf_cpus[i] == 0)) { + if (busy_rows[i] > row || + (busy_rows[i] == row && sh_tasks[i] == 0)) { for (j = f; j < f+freq[i]; j++) { if (reqmap && bit_test(reqmap, j)) @@ -2370,31 +2381,30 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, if (try == 0) continue; /* Step 2: - * set mf_cpus = al_cpus for nodes who's - * al_rows value is < 'row'. + * set sh_tasks = al_tasks for nodes who's + * busy_rows value is < 'row'. * Why? 
to select a new row for these * nodes when appropriate */ - if (al_rows[i] == -1 || - al_rows[i] >= row) + if (busy_rows[i] == -1 || busy_rows[i] >= row) continue; - if (mf_cpus[i] == al_cpus[i]) + if (sh_tasks[i] == al_tasks[i]) continue; - if (try == 1 && mf_cpus[i] != 0) + if (try == 1 && sh_tasks[i] != 0) continue; - mf_cpus[i] = al_cpus[i]; + sh_tasks[i] = al_tasks[i]; } if (bit_set_count(bitmap) < min_nodes) break; for (i = 0; i < array_size; i++) { - debug3("cons_res: i %d row %d fcpus %d acpus %d freq %d", - i, al_rows[i], mf_cpus[i], al_cpus[i], - freq[i]); + debug3("cons_res: i %d row %d stasks %d atasks %d freq %d", + i, busy_rows[i], sh_tasks[i], + al_tasks[i], freq[i]); } error_code = _select_nodes(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, - mf_cpus, freq, array_size); + sh_tasks, freq, array_size); if (error_code == SLURM_SUCCESS) break; } @@ -2404,9 +2414,9 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, bit_free(origmap); if (error_code != SLURM_SUCCESS) { - xfree(al_rows); - xfree(mf_cpus); - xfree(al_cpus); + xfree(busy_rows); + xfree(sh_tasks); + xfree(al_tasks); xfree(freq); return error_code; } @@ -2475,8 +2485,8 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, break; } job->host[j] = xstrdup(node_record_table_ptr[i].name); - job->cpus[j] = mf_cpus[a]; - row += mf_cpus[a]; + job->cpus[j] = sh_tasks[a]; + row += sh_tasks[a]; if (layout_ptr && bit_test(job_ptr->details->req_node_bitmap, i)) { job->cpus[j] = MIN(job->cpus[j], layout_ptr[ll]); @@ -2495,9 +2505,9 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, j++; } - xfree(al_rows); - xfree(mf_cpus); - xfree(al_cpus); + xfree(busy_rows); + xfree(sh_tasks); + xfree(al_tasks); xfree(freq); /* When 'srun --overcommit' is used, nprocs is set to a minimum value diff --git a/src/plugins/select/cons_res/select_cons_res.h b/src/plugins/select/cons_res/select_cons_res.h index 304318f6745..1e9fd82896b 100644 --- a/src/plugins/select/cons_res/select_cons_res.h +++ b/src/plugins/select/cons_res/select_cons_res.h @@ -87,7 +87,7 @@ struct part_cr_record { * node_cr_record.node_state assists with the unique state of each node. * NOTES: * - If node is in use by Shared=NO part, some CPUs/memory may be available - * - Caution with NODE_CR_AVAILBLE: a Sharing partition could be full!! + * - Caution with NODE_CR_AVAILABLE: a Sharing partition could be full!! 
*/ enum node_cr_state { NODE_CR_RESERVED, /* node is NOT available for use by any other jobs */ diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 97541ade043..9495926ada6 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -355,36 +355,65 @@ _job_count_bitmap(bitstr_t * bitmap, bitstr_t * jobmap, int job_cnt) * IN cons_res_flag - 1 if the consumable resources flag is enable, 0 otherwise * * RET - 1 if nodes can be shared, 0 if nodes cannot be shared + * + * + * The followed table details the node SHARED state for the various scenarios + * + * part= part= part= part= + * cons_res user_request EXCLUS NO YES FORCE + * -------- ------------ ------ ----- ----- ----- + * no default/exclus whole whole whole share/O + * no share=yes whole whole share/O share/O + * yes default whole share share/O share/O + * yes exclusive whole whole whole share/O + * yes share=yes whole share share/O share/O + * + * whole = whole node is allocated exclusively to the user + * share = nodes may be shared but the resources are not overcommitted + * share/O = nodes are shared and the resources can be overcommitted + * + * part->max_share: + * &SHARED_FORCE = FORCE + * 0 = EXCLUSIVE + * 1 = NO + * > 1 = YES + * + * job_ptr->details->shared: + * (uint16_t)NO_VAL = default + * 0 = exclusive + * 1 = share=yes + * + * Here are the desired scheduler actions to take: + * IF cons_res enabled, THEN 'shared' ensures that p_i_bitmap is used AND + * _pick_best_load IS NOT called + * IF cons_res NOT enabled, THEN 'shared' ensures that share_bitmap is used AND + * _pick_best_load IS called */ static int _resolve_shared_status(uint16_t user_flag, uint16_t part_max_share, int cons_res_flag) { - int shared; + /* no sharing if part=EXCLUSIVE */ + if (part_max_share == 0) + return 0; + /* sharing if part=FORCE */ + if (part_max_share & SHARED_FORCE) + return 1; if (cons_res_flag) { - /* - * Consumable resources will always share nodes by default, - * the partition or user has to explicitly disable sharing to - * get exclusive nodes. - */ - if ((part_max_share == 0) || (user_flag == 0)) - shared = 0; - else - shared = 1; + /* sharing unless user requested exclusive */ + if (user_flag == 0) + return 0; + return 1; } else { - /* The partition sharing option is only used if - * the consumable resources plugin is NOT in use. - */ - if (part_max_share & SHARED_FORCE) /* shared=force */ - shared = 1; - else if (part_max_share <= 1) /* can't share */ - shared = 0; - else - shared = (user_flag == 1) ? 1 : 0; + /* no sharing if part=NO */ + if (part_max_share == 1) + return 0; + /* share if the user requested it */ + if (user_flag == 1) + return 1; } - - return shared; + return 0; } /* @@ -573,7 +602,6 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, int max_feature, min_feature; bool runable_ever = false; /* Job can ever run */ bool runable_avail = false; /* Job can run with available nodes */ - bool pick_light_load = false; uint32_t cr_enabled = 0; int shared = 0; select_type_plugin_info_t cr_type = SELECT_TYPE_INFO_NONE; @@ -594,29 +622,20 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, job_ptr->details->shared = shared; if (cr_enabled) { - shared = 0; job_ptr->cr_enabled = cr_enabled; /* CR enabled for this job */ cr_type = (select_type_plugin_info_t) slurmctld_conf. 
select_type_param; - if (cr_type == CR_MEMORY) { - shared = 1; /* Sharing set when only memory - * as a CR is enabled */ - } else if ((cr_type == CR_SOCKET) - || (cr_type == CR_CORE) - || (cr_type == CR_CPU)) { + if ((cr_type == CR_CORE) || + (cr_type == CR_CPU) || (cr_type == CR_SOCKET)) { job_ptr->details->job_max_memory = 0; } - debug3("Job %u in exclusive mode? " - "%d cr_enabled %d CR type %d num_procs %d", - job_ptr->job_id, - job_ptr->details->shared ? 0 : 1, - cr_enabled, - cr_type, + debug3("Job %u shared %d cr_enabled %d CR type %d num_procs %d", + job_ptr->job_id, shared, cr_enabled, cr_type, job_ptr->num_procs); - if (job_ptr->details->shared == 0) { + if (shared == 0) { partially_idle_node_bitmap = bit_copy(idle_node_bitmap); } else { /* Update partially_idle_node_bitmap to reflect the @@ -655,8 +674,13 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; } - /* shared needs to be checked before cr_enabled - * to make sure that CR_MEMORY works correctly */ + if (cr_enabled) { + if (!bit_super_set(job_ptr->details->req_node_bitmap, + partially_idle_node_bitmap)) { + FREE_NULL_BITMAP(partially_idle_node_bitmap); + return ESLURM_NODES_BUSY; + } + } if (shared) { if (!bit_super_set(job_ptr->details->req_node_bitmap, share_node_bitmap)) { @@ -665,15 +689,12 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, partially_idle_node_bitmap); return ESLURM_NODES_BUSY; } - } else if (cr_enabled) { - if (!bit_super_set(job_ptr->details->req_node_bitmap, - partially_idle_node_bitmap)) { - FREE_NULL_BITMAP(partially_idle_node_bitmap); - return ESLURM_NODES_BUSY; - } } else { if (!bit_super_set(job_ptr->details->req_node_bitmap, idle_node_bitmap)) { + if (cr_enabled) + FREE_NULL_BITMAP( + partially_idle_node_bitmap); return ESLURM_NODES_BUSY; } } @@ -682,11 +703,6 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, * set-up before job launch */ total_nodes = 0; /* reinitialize */ } - -#ifndef HAVE_BG - if (shared) - pick_light_load = true; -#endif /* identify the min and max feature values for exclusive OR */ max_feature = -1; @@ -727,14 +743,13 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, } bit_and(node_set_ptr[i].my_bitmap, avail_node_bitmap); - /* shared needs to be checked before cr_enabled - * to make sure that CR_MEMORY works correctly. 
*/ + if (cr_enabled) { + bit_and(node_set_ptr[i].my_bitmap, + partially_idle_node_bitmap); + } if (shared) { bit_and(node_set_ptr[i].my_bitmap, share_node_bitmap); - } else if (cr_enabled) { - bit_and(node_set_ptr[i].my_bitmap, - partially_idle_node_bitmap); } else { bit_and(node_set_ptr[i].my_bitmap, idle_node_bitmap); @@ -759,8 +774,10 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, FREE_NULL_BITMAP(possible_bitmap); return error_code; } - if (pick_light_load) +#ifndef HAVE_BG + if (shared) continue; /* Keep accumulating */ +#endif if (avail_nodes == 0) continue; /* Keep accumulating */ if ((job_ptr->details->req_node_bitmap) && @@ -804,10 +821,43 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, avail_bitmap = backup_bitmap; } } /* for (i = 0; i < node_set_size; i++) */ +#ifndef HAVE_BG + pick_code = 1; + if (job_ptr->details->req_node_bitmap && + !bit_super_set(job_ptr->details->req_node_bitmap, + avail_bitmap)) + pick_code = 0; + if ((avail_nodes < min_nodes) || + ((req_nodes > min_nodes) && (avail_nodes < req_nodes))) + pick_code = 0; + if (avail_cpus < job_ptr->num_procs) + pick_code = 0; + + if (pick_code && cr_enabled) { + /* now that we have all possible resources, + * let's call the select plugin */ + backup_bitmap = bit_copy(avail_bitmap); + pick_code = select_g_job_test(job_ptr, + avail_bitmap, + min_nodes, + max_nodes, + req_nodes, + false); - /* try picking the lightest load from all - available nodes with this feature set */ - if (pick_light_load) { + if (pick_code == SLURM_SUCCESS) { + FREE_NULL_BITMAP(backup_bitmap); + FREE_NULL_BITMAP(total_bitmap); + FREE_NULL_BITMAP(possible_bitmap); + FREE_NULL_BITMAP(partially_idle_node_bitmap); + *select_bitmap = avail_bitmap; + return SLURM_SUCCESS; + } else { + FREE_NULL_BITMAP(avail_bitmap); + avail_bitmap = backup_bitmap; + } + } else if (pick_code && shared) { + /* try picking the lightest load from all + available nodes with this feature set */ backup_bitmap = bit_copy(avail_bitmap); pick_code = _pick_best_load(job_ptr, avail_bitmap, @@ -834,7 +884,7 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, avail_bitmap = backup_bitmap; } } - +#endif /* try to get req_nodes now for this feature */ if (avail_bitmap && (req_nodes > min_nodes) @@ -1040,7 +1090,7 @@ _add_node_set_info(struct node_set *node_set_ptr, /* IF there's at least one CPU available AND there's * memory available for this job on this node THEN * log the node... */ - if ((this_cpu_cnt > 0) && (this_mem_cnt > 0)) { + if ((this_cpu_cnt > 0) && (this_mem_cnt >= 0)) { *node_cnt += 1; *cpu_cnt += this_cpu_cnt; -- GitLab
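
For readers tracing the new sharing logic, here is a minimal standalone sketch of the decision table this patch documents above _resolve_shared_status(). It is illustrative only: the function name resolve_shared, the SHARED_FORCE bit value, and the USER_DEFAULT stand-in for (uint16_t)NO_VAL are assumptions made for this sketch, not definitions taken from the patch or from slurm.h.

/*
 * Standalone sketch of the node-sharing decision table added in this
 * patch.  The constants below are assumed values for illustration; the
 * real definitions live in the SLURM headers.
 */
#include <stdint.h>
#include <stdio.h>

#define SHARED_FORCE  0x8000            /* assumed flag bit for Shared=FORCE  */
#define USER_DEFAULT  ((uint16_t)0xfffe)/* assumed stand-in for (uint16_t)NO_VAL */

/* Returns 1 if the job may share nodes, 0 if it gets whole nodes. */
static int resolve_shared(uint16_t user_flag, uint16_t part_max_share,
			  int cons_res_flag)
{
	if (part_max_share == 0)		/* partition Shared=EXCLUSIVE */
		return 0;
	if (part_max_share & SHARED_FORCE)	/* partition Shared=FORCE */
		return 1;
	if (cons_res_flag)			/* select/cons_res in use: */
		return (user_flag == 0) ? 0 : 1;/* whole node only on request */
	if (part_max_share == 1)		/* partition Shared=NO */
		return 0;
	return (user_flag == 1) ? 1 : 0;	/* Shared=YES: share on request */
}

int main(void)
{
	/* spot-check a few rows of the table from the patch comment */
	printf("cons_res, default,   part=YES -> %d (expect 1)\n",
	       resolve_shared(USER_DEFAULT, 4, 1));
	printf("cons_res, exclusive, part=YES -> %d (expect 0)\n",
	       resolve_shared(0, 4, 1));
	printf("no cons_res, share=yes, part=NO -> %d (expect 0)\n",
	       resolve_shared(1, 1, 0));
	return 0;
}

Each printf corresponds to one row of the table in the patch comment; under these assumed constants the sketch returns 1 only where that table reads "share" or "share/O".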