From 75998c739e654c1a2322c209455a9a086d62823f Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 2 Oct 2008 19:39:13 +0000 Subject: [PATCH] apply hang_fix2.patch from Chris Holmes --- src/plugins/select/cons_res/job_test.c | 44 ++++--- src/plugins/select/cons_res/job_test.h | 3 +- src/plugins/select/cons_res/select_cons_res.c | 120 +++++++++++------- src/plugins/select/cons_res/select_cons_res.h | 9 +- 4 files changed, 108 insertions(+), 68 deletions(-) diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index f2a0552c09b..f6602eb2061 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -584,6 +584,7 @@ fini: */ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map, const uint32_t node_i, + struct node_use_record *node_usage, select_type_plugin_info_t cr_type) { uint16_t cpus; @@ -617,7 +618,7 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map, */ req_mem = job_ptr->details->job_min_memory & ~MEM_PER_CPU; avail_mem = select_node_record[node_i].real_memory - - select_node_record[node_i].alloc_memory; + node_usage[node_i].alloc_memory; if (job_ptr->details->job_min_memory & MEM_PER_CPU) { /* memory is per-cpu */ while (cpus > 0 && (req_mem * cpus) > avail_mem) @@ -634,8 +635,8 @@ uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map, debug3("cons_res: _can_job_run_on_node: %u cpus on %s(%d), mem %u/%u", cpus, select_node_record[node_i].node_ptr->name, - select_node_record[node_i].node_state, - select_node_record[node_i].alloc_memory, + node_usage[node_i].node_state, + node_usage[node_i].alloc_memory, select_node_record[node_i].real_memory); return cpus; @@ -691,6 +692,7 @@ static int _is_node_busy(struct part_res_record *p_ptr, uint32_t node_i, static int _verify_node_state(struct part_res_record *cr_part_ptr, struct job_record *job_ptr, bitstr_t * bitmap, select_type_plugin_info_t 
cr_type, + struct node_use_record *node_usage, enum node_cr_state job_node_req) { int i; @@ -707,7 +709,7 @@ static int _verify_node_state(struct part_res_record *cr_part_ptr, ((cr_type == CR_CORE_MEMORY) || (cr_type == CR_CPU_MEMORY) || (cr_type == CR_MEMORY) || (cr_type == CR_SOCKET_MEMORY))) { free_mem = select_node_record[i].real_memory; - free_mem -= select_node_record[i].alloc_memory; + free_mem -= node_usage[i].alloc_memory; if (free_mem < min_mem) { debug3("cons_res: _vns: node %s no mem %u < %u", select_node_record[i].node_ptr->name, @@ -732,13 +734,13 @@ static int _verify_node_state(struct part_res_record *cr_part_ptr, continue; /* exclusive node check */ - if (select_node_record[i].node_state == NODE_CR_RESERVED) { + if (node_usage[i].node_state == NODE_CR_RESERVED) { debug3("cons_res: _vns: node %s in exclusive use", select_node_record[i].node_ptr->name); goto clear_bit; /* non-resource-sharing node check */ - } else if (select_node_record[i].node_state == + } else if (node_usage[i].node_state == NODE_CR_ONE_ROW) { if ((job_node_req == NODE_CR_RESERVED) || (job_node_req == NODE_CR_AVAILABLE)) { @@ -849,6 +851,7 @@ static int _get_cpu_cnt(struct job_record *job_ptr, const int node_index, */ uint32_t _get_res_usage(struct job_record *job_ptr, bitstr_t *node_map, bitstr_t *core_map, uint32_t cr_node_cnt, + struct node_use_record *node_usage, select_type_plugin_info_t cr_type, uint16_t **cpu_cnt_ptr, uint32_t **freq_ptr) { @@ -862,7 +865,7 @@ uint32_t _get_res_usage(struct job_record *job_ptr, bitstr_t *node_map, for (n = 0; n < cr_node_cnt; n++) { if (bit_test(node_map, n)) { cpu_count = _can_job_run_on_node(job_ptr, core_map, - n, cr_type); + n, node_usage, cr_type); if (cpu_count == cpu_cnt[size]) { freq[size]++; continue; @@ -1230,6 +1233,7 @@ static uint16_t *_select_nodes(struct job_record *job_ptr, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, bitstr_t *node_map, uint32_t cr_node_cnt, bitstr_t *core_map, + struct node_use_record 
*node_usage, select_type_plugin_info_t cr_type) { int rc; @@ -1241,7 +1245,7 @@ static uint16_t *_select_nodes(struct job_record *job_ptr, uint32_t min_nodes, /* get resource usage for this job from each available node */ size = _get_res_usage(job_ptr, node_map, core_map, cr_node_cnt, - cr_type, &cpu_cnt, &freq); + node_usage, cr_type, &cpu_cnt, &freq); /* choose the best nodes for the job */ rc = _choose_nodes(job_ptr, node_map, min_nodes, max_nodes, req_nodes, @@ -1300,7 +1304,8 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, uint32_t req_nodes, int mode, select_type_plugin_info_t cr_type, enum node_cr_state job_node_req, uint32_t cr_node_cnt, - struct part_res_record *cr_part_ptr) + struct part_res_record *cr_part_ptr, + struct node_use_record *node_usage) { int error_code = SLURM_SUCCESS, ll; /* ll = layout array index */ uint16_t *layout_ptr = NULL; @@ -1329,7 +1334,8 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, /* check node_state and update the node bitmap as necessary */ if (!test_only) { error_code = _verify_node_state(cr_part_ptr, job_ptr, - bitmap, cr_type, job_node_req); + bitmap, cr_type, node_usage, + job_node_req); if (error_code != SLURM_SUCCESS) { if (save_mem) job_ptr->details->job_min_memory = save_mem; @@ -1358,7 +1364,8 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, */ free_cores = bit_copy(avail_cores); cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, - bitmap, cr_node_cnt, free_cores, cr_type); + bitmap, cr_node_cnt, free_cores, + node_usage, cr_type); if (cpu_count == NULL) { /* job cannot fit */ bit_free(orig_map); @@ -1437,7 +1444,8 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, } } cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, - bitmap, cr_node_cnt, free_cores, cr_type); + bitmap, cr_node_cnt, free_cores, + node_usage, cr_type); if (cpu_count) { /* job fits! We're done. 
*/ debug3("cons_res: cr_job_test: test 1 pass - " @@ -1476,7 +1484,8 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, /* make these changes permanent */ bit_copybits(avail_cores, free_cores); cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, - bitmap, cr_node_cnt, free_cores, cr_type); + bitmap, cr_node_cnt, free_cores, + node_usage, cr_type); if (!cpu_count) { /* job needs resources that are currently in use by * higher-priority jobs, so fail for now */ @@ -1507,7 +1516,8 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, } } cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, - bitmap, cr_node_cnt, free_cores, cr_type); + bitmap, cr_node_cnt, free_cores, + node_usage, cr_type); if (cpu_count) { /* lo-pri jobs are the only thing left in our way. * for now we'll ignore them, but FIXME: we need @@ -1540,7 +1550,7 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, bit_copybits(free_cores, avail_cores); cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, bitmap, cr_node_cnt, - free_cores, cr_type); + free_cores, node_usage, cr_type); debug3("cons_res: cr_job_test: test 4 pass - first row found"); goto alloc_job; } @@ -1556,7 +1566,7 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, bit_and(free_cores, tmpcore); cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, bitmap, cr_node_cnt, - free_cores, cr_type); + free_cores, node_usage, cr_type); if (cpu_count) { debug3("cons_res: cr_job_test: test 4 pass - row %i",i); break; @@ -1571,7 +1581,7 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, debug3("cons_res: cr_job_test: test 4 trying empty row %i",i); cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, bitmap, cr_node_cnt, - free_cores, cr_type); + free_cores, node_usage, cr_type); } if (!cpu_count) { diff --git a/src/plugins/select/cons_res/job_test.h 
b/src/plugins/select/cons_res/job_test.h index 9a995e6be38..64ec876096c 100644 --- a/src/plugins/select/cons_res/job_test.h +++ b/src/plugins/select/cons_res/job_test.h @@ -63,6 +63,7 @@ int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, int mode, select_type_plugin_info_t cr_type, enum node_cr_state job_node_req, uint32_t cr_node_cnt, - struct part_res_record *cr_part_ptr); + struct part_res_record *cr_part_ptr, + struct node_use_record *node_usage); #endif /* !_CR_JOB_TEST_H */ diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 81c41de2e4a..778e6963ca9 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -147,8 +147,9 @@ uint16_t select_fast_schedule; uint16_t *cr_node_num_cores = NULL; uint32_t *cr_num_core_count = NULL; -struct node_res_record *select_node_record = NULL; struct part_res_record *select_part_record = NULL; +struct node_res_record *select_node_record = NULL; +struct node_use_record *select_node_usage = NULL; static int select_node_cnt = 0; @@ -181,8 +182,8 @@ static void _dump_nodes() select_node_record[i].sockets, select_node_record[i].vpus, select_node_record[i].real_memory, - select_node_record[i].alloc_memory, - select_node_record[i].node_state); + select_node_usage[i].alloc_memory, + select_node_usage[i].node_state); } } @@ -353,6 +354,26 @@ static struct part_res_record *_dup_part_data(struct part_res_record *orig_ptr) } +/* Create a duplicate node_use_record array */ +static struct node_use_record *_dup_node_usage(struct node_use_record *orig_ptr) +{ + struct node_use_record *new_use_ptr, *new_ptr; + uint32_t i; + + if (orig_ptr == NULL) + return NULL; + + new_use_ptr = xmalloc(select_node_cnt * sizeof(struct node_use_record)); + new_ptr = new_use_ptr; + + for (i = 0; i < select_node_cnt; i++) { + new_ptr[i].node_state = orig_ptr[i].node_state; + 
new_ptr[i].alloc_memory = orig_ptr[i].alloc_memory; + } + return new_use_ptr; +} + + /* delete the given list of partition data */ static void _destroy_part_data(struct part_res_record *this_ptr) { @@ -459,11 +480,12 @@ struct part_res_record *_get_cr_part_ptr(struct part_record *part_ptr) } -/* delete the select_node_record array */ -static void _destroy_node_data() +/* delete the given select_node_record and select_node_usage arrays */ +static void _destroy_node_data(struct node_use_record *node_usage, + struct node_res_record *node_data) { - xfree(select_node_record); - select_node_record = NULL; + xfree(node_data); + xfree(node_usage); } @@ -763,13 +785,13 @@ static int _add_job_to_res(struct job_record *job_ptr, int action) for (i = 0, n = 0; i < select_node_cnt; i++) { if (!bit_test(job->node_bitmap, i)) continue; - select_node_record[i].alloc_memory += + select_node_usage[i].alloc_memory += job->memory_allocated[n]; - if (select_node_record[i].alloc_memory > + if (select_node_usage[i].alloc_memory > select_node_record[i].real_memory) { error("error: node %s mem is overallocated(%d)", select_node_record[i].node_ptr->name, - select_node_record[i].alloc_memory); + select_node_usage[i].alloc_memory); } n++; @@ -817,7 +839,7 @@ static int _add_job_to_res(struct job_record *job_ptr, int action) /* update the node state */ node_st: for (i = 0; i < select_node_cnt; i++) { if (bit_test(job->node_bitmap, i)) - select_node_record[i].node_state =job->node_req; + select_node_usage[i].node_state = job->node_req; } #if (CR_DEBUG) info("DEBUG: _add_job_to_res:"); @@ -864,7 +886,9 @@ static int _is_node_free(struct part_res_record *p_ptr, uint32_t node_i) * if action = 2 then only subtract cores (job is suspended) * */ -static int _rm_job_from_res(struct job_record *job_ptr, int action) +static int _rm_job_from_res(struct part_res_record *part_record_ptr, + struct node_use_record *node_usage, + struct job_record *job_ptr, int action) { struct select_job_res *job = 
job_ptr->select_job; int i, n; @@ -885,15 +909,15 @@ static int _rm_job_from_res(struct job_record *job_ptr, int action) for (i = 0, n = 0; i < select_node_cnt; i++) { if (!bit_test(job->node_bitmap, i)) continue; - if (select_node_record[i].alloc_memory < - job->memory_allocated[n]) { + if (node_usage[i].alloc_memory < + job->memory_allocated[n]) { error("error: %s mem is underalloc'd(%u-%u)", select_node_record[i].node_ptr->name, - select_node_record[i].alloc_memory, + node_usage[i].alloc_memory, job->memory_allocated[n]); - select_node_record[i].alloc_memory = 0; + node_usage[i].alloc_memory = 0; } else { - select_node_record[i].alloc_memory -= + node_usage[i].alloc_memory -= job->memory_allocated[n]; } n++; @@ -905,10 +929,13 @@ static int _rm_job_from_res(struct job_record *job_ptr, int action) /* reconstruct rows with remaining jobs */ struct part_res_record *p_ptr; - p_ptr = _get_cr_part_ptr(job_ptr->part_ptr); + for (p_ptr = part_record_ptr; p_ptr; p_ptr = p_ptr->next) { + if (strcmp(p_ptr->name, job_ptr->part_ptr->name) == 0) + break; + } if (!p_ptr) { error("error: 'rm' could not find part %s", - job_ptr->part_ptr); + job_ptr->part_ptr->name); return SLURM_ERROR; } @@ -950,11 +977,11 @@ static int _rm_job_from_res(struct job_record *job_ptr, int action) for (n = 0; n < select_node_cnt; n++) { if (bit_test(job->node_bitmap, n) == 0) continue; - if (select_node_record[n].node_state == + if (node_usage[n].node_state == NODE_CR_AVAILABLE) continue; if (_is_node_free(select_part_record, n)) - select_node_record[n].node_state = + node_usage[n].node_state = NODE_CR_AVAILABLE; } } @@ -990,7 +1017,9 @@ extern int init(void) extern int fini(void) { - _destroy_node_data(); + _destroy_node_data(select_node_usage, select_node_record); + select_node_record = NULL; + select_node_usage = NULL; _destroy_part_data(select_part_record); select_part_record = NULL; xfree(cr_node_num_cores); @@ -1066,9 +1095,10 @@ extern int select_p_node_init(struct node_record *node_ptr, int 
node_cnt) /* initial global core data structures */ _init_global_core_data(node_ptr, node_cnt); - _destroy_node_data(); + _destroy_node_data(select_node_usage, select_node_record); select_node_cnt = node_cnt; select_node_record = xmalloc(node_cnt * sizeof(struct node_res_record)); + select_node_usage = xmalloc(node_cnt * sizeof(struct node_use_record)); select_fast_schedule = slurm_get_fast_schedule(); for (i = 0; i < select_node_cnt; i++) { @@ -1090,7 +1120,7 @@ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) select_node_record[i].real_memory = node_ptr[i].real_memory; } - select_node_record[i].node_state = NODE_CR_AVAILABLE; + select_node_usage[i].node_state = NODE_CR_AVAILABLE; } return SLURM_SUCCESS; @@ -1200,7 +1230,8 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t * bitmap, } else { rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, mode, cr_type, job_node_req, - select_node_cnt, select_part_record); + select_node_cnt, select_part_record, + select_node_usage); } #if (CR_DEBUG) @@ -1246,8 +1277,6 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, ListIterator job_iterator; bitstr_t *orig_map; int rc = SLURM_ERROR; - uint32_t i, *orig_alloc_mem; - uint8_t *orig_node_state; time_t now = time(NULL); orig_map = bit_copy(bitmap); @@ -1255,7 +1284,8 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, /* Try to run with currently available nodes */ rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, cr_type, job_node_req, - select_node_cnt, select_part_record); + select_node_cnt, select_part_record, + select_node_usage); if (rc == SLURM_SUCCESS) { bit_free(orig_map); job_ptr->start_time = time(NULL); @@ -1270,14 +1300,11 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, bit_free(orig_map); return SLURM_ERROR; } - /* Need to preserve node_res_record->alloc_memory and - * node_res_record->node_state, 
which will both be - * altered when jobs are removed */ - orig_alloc_mem = xmalloc(select_node_cnt * sizeof(uint32_t)); - orig_node_state = xmalloc(select_node_cnt * sizeof(uint8_t)); - for (i = 0; i < select_node_cnt; i++) { - orig_alloc_mem[i] = select_node_record[i].alloc_memory; - orig_node_state[i] = select_node_record[i].node_state; + future_usage = _dup_node_usage(select_node_usage); + if (future_usage == NULL) { + _destroy_part_data(future_part); + bit_free(orig_map); + return SLURM_ERROR; } /* Build list of running jobs */ @@ -1302,11 +1329,12 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, job_iterator = list_iterator_create(cr_job_list); while ((tmp_job_pptr = (struct job_record **) list_next(job_iterator))) { tmp_job_ptr = *tmp_job_pptr; - _rm_job_from_res(tmp_job_ptr, 0); + _rm_job_from_res(future_part, future_usage, tmp_job_ptr, 0); bit_or(bitmap, orig_map); rc = cr_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, cr_type, - job_node_req, select_node_cnt, future_part); + job_node_req, select_node_cnt, future_part, + future_usage); if (rc == SLURM_SUCCESS) { if (tmp_job_ptr->end_time <= now) job_ptr->start_time = now + 1; @@ -1318,12 +1346,7 @@ static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap, list_iterator_destroy(job_iterator); list_destroy(cr_job_list); _destroy_part_data(future_part); - for(i = 0; i < select_node_cnt; i++) { - select_node_record[i].alloc_memory = orig_alloc_mem[i]; - select_node_record[i].node_state = orig_node_state[i]; - } - xfree(orig_alloc_mem); - xfree(orig_node_state); + _destroy_node_data(future_usage, NULL); bit_free(orig_map); return rc; } @@ -1343,7 +1366,7 @@ extern int select_p_job_fini(struct job_record *job_ptr) xassert(job_ptr); xassert(job_ptr->magic == JOB_MAGIC); - _rm_job_from_res(job_ptr, 0); + _rm_job_from_res(select_part_record, select_node_usage, job_ptr, 0); return SLURM_SUCCESS; } @@ -1355,7 +1378,8 @@ extern int 
select_p_job_suspend(struct job_record *job_ptr) { xassert(job_ptr); - return _rm_job_from_res(job_ptr, 2); + return _rm_job_from_res(select_part_record, select_node_usage, + job_ptr, 2); } /* See NOTE with select_p_job_suspend above */ @@ -1459,12 +1483,12 @@ static uint16_t _is_node_avail(uint32_t node_i) uint32_t i, r, cpu_begin, cpu_end; /* check the node state */ - if (select_node_record[node_i].node_state == NODE_CR_RESERVED) + if (select_node_usage[node_i].node_state == NODE_CR_RESERVED) return (uint16_t) 0; cpu_begin = cr_get_coremap_offset(node_i); cpu_end = cr_get_coremap_offset(node_i+1); - if (select_node_record[node_i].node_state == NODE_CR_ONE_ROW) { + if (select_node_usage[node_i].node_state == NODE_CR_ONE_ROW) { /* check the core_bitmaps in "single-row" partitions */ for (p_ptr = select_part_record; p_ptr; p_ptr = p_ptr->next) { if (p_ptr->num_rows > 1) diff --git a/src/plugins/select/cons_res/select_cons_res.h b/src/plugins/select/cons_res/select_cons_res.h index ead26faf8eb..573b4523064 100644 --- a/src/plugins/select/cons_res/select_cons_res.h +++ b/src/plugins/select/cons_res/select_cons_res.h @@ -91,7 +91,7 @@ struct part_res_record { struct part_res_record *next; /* Ptr to next part_res_record */ }; -/* node resource data, including memory allocation data */ +/* per-node resource data */ struct node_res_record { struct node_record *node_ptr; /* ptr to the actual node */ uint16_t cpus; /* count of processors configured */ @@ -100,16 +100,21 @@ struct node_res_record { uint16_t vpus; /* count of virtual cpus (hyperthreads) * configured per core */ uint32_t real_memory; /* MB of real memory configured */ +}; +/* per-node resource usage record */ +struct node_use_record { enum node_cr_state node_state; /* see node_cr_state comments */ uint32_t alloc_memory; /* real memory reserved by already * scheduled jobs */ }; + extern uint16_t select_fast_schedule; -extern struct node_res_record *select_node_record; extern struct part_res_record 
*select_part_record; +extern struct node_res_record *select_node_record; +extern struct node_use_record *select_node_usage; extern void cr_sort_part_rows(struct part_res_record *p_ptr); extern uint32_t cr_get_coremap_offset(uint32_t node_index); -- GitLab