From 5b61a4648b8d1de5b7a1c01f7cc32d01dbbdc999 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 4 May 2009 22:23:45 +0000
Subject: [PATCH] Avoid using powered down nodes when scheduling work if
 possible. Fix possible invalid memory reference in power save logic.

---
 NEWS                           |  6 ++--
 src/slurmctld/agent.c          |  3 +-
 src/slurmctld/node_mgr.c       |  2 ++
 src/slurmctld/node_scheduler.c | 53 ++++++++++++++++++++++++++++++++--
 src/slurmctld/power_save.c     |  2 ++
 src/slurmctld/read_config.c    | 24 ++++++++-------
 src/slurmctld/slurmctld.h      |  2 ++
 7 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/NEWS b/NEWS
index 050a9baf0ee..6b48df4d989 100644
--- a/NEWS
+++ b/NEWS
@@ -18,9 +18,11 @@ documents those changes that are of interest to users and admins.
     compute node).
  -- Add slurmctld and slurmd binding to appropriate communications address
     based upon NodeAddr, ControllerAddr and BackupAddr configuration
-    parameters. Patch from Matthieu Hautreux, CEA.
-    NOTE: Failing on Debian Linux.
+    parameters. Based upon patch from Matthieu Hautreux, CEA.
+    NOTE: Fails under some configurations when SlurmDBD is in use.
     NOTE: You must define BIND_SPECIFIC_ADDR to enable this option.
+ -- Avoid using powered down nodes when scheduling work if possible.
+    Fix possible invalid memory reference in power save logic.
 
 * Changes in SLURM 1.4.0-pre13
 ==============================
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 52572bb5384..7c9fdad50ee 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -1497,7 +1497,8 @@ static bool _batch_launch_defer(queued_request_t *queued_req_ptr)
 	} else if (difftime(now, queued_req_ptr->first_attempt) >=
 		   slurm_get_batch_start_timeout()) {
 		error("agent waited too long for node %s to come up, "
-		      "sending batch request anyway...");
+		      "sending batch request anyway...",
+		      node_ptr->name);
 		queued_req_ptr->last_attempt = (time_t) 0;
 		return false;
 	}
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 197c64a3968..ad9e15ccd2d 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -88,6 +88,7 @@ time_t last_node_update = (time_t) NULL;	/* time of last update to
 					 * node records */
 bitstr_t *avail_node_bitmap = NULL;	/* bitmap of available nodes */
 bitstr_t *idle_node_bitmap  = NULL;	/* bitmap of idle nodes */
+bitstr_t *power_node_bitmap = NULL;	/* bitmap of powered down nodes */
 bitstr_t *share_node_bitmap = NULL;	/* bitmap of sharable nodes */
 bitstr_t *up_node_bitmap    = NULL;	/* bitmap of non-down nodes */
 
@@ -2640,6 +2641,7 @@ void node_fini(void)
 
 	FREE_NULL_BITMAP(idle_node_bitmap);
 	FREE_NULL_BITMAP(avail_node_bitmap);
+	FREE_NULL_BITMAP(power_node_bitmap);
 	FREE_NULL_BITMAP(share_node_bitmap);
 	FREE_NULL_BITMAP(up_node_bitmap);
 
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 8cf681a6a81..740866719ee 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1241,17 +1241,18 @@ static int _build_node_list(struct job_record *job_ptr,
 			    struct node_set **node_set_pptr,
 			    int *node_set_size)
 {
-	int node_set_inx, rc;
+	int i, node_set_inx, power_cnt, rc;
 	struct node_set *node_set_ptr;
 	struct config_record *config_ptr;
 	struct part_record *part_ptr = job_ptr->part_ptr;
 	ListIterator config_iterator;
 	int check_node_config, config_filter = 0;
 	struct job_details *detail_ptr = job_ptr->details;
-	bitstr_t *usable_node_mask = NULL;
+	bitstr_t *power_up_bitmap = NULL, *usable_node_mask = NULL;
 	multi_core_data_t *mc_ptr = detail_ptr->mc_ptr;
 	bitstr_t *tmp_feature;
 	time_t when = time(NULL);
+	uint32_t max_weight = 0;
 
 	if (job_ptr->resv_name) {
 		/* Limit node selection to those in selected reservation */
@@ -1359,9 +1360,10 @@ static int _build_node_list(struct job_record *job_ptr,
 		node_set_ptr[node_set_inx].cpus_per_node =
 			config_ptr->cpus;
 		node_set_ptr[node_set_inx].real_memory =
-			config_ptr->real_memory;
+					config_ptr->real_memory;
 		node_set_ptr[node_set_inx].weight =
 			config_ptr->weight;
+		max_weight = MAX(max_weight, config_ptr->weight);
 		node_set_ptr[node_set_inx].features =
 			xstrdup(config_ptr->feature);
 		node_set_ptr[node_set_inx].feature_array =
@@ -1389,6 +1391,51 @@ static int _build_node_list(struct job_record *job_ptr,
 		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
 	}
 
+	/* If any nodes are powered down, put them into a new node_set
+	 * record with a higher scheduling weight (avoids using powered
+	 * down nodes where possible). */
+	for (i = (node_set_inx-1); i >= 0; i--) {
+		power_cnt = bit_overlap(node_set_ptr[i].my_bitmap,
+					power_node_bitmap);
+		if (power_cnt == 0)
+			continue;	/* no nodes powered down */
+		if (power_cnt == node_set_ptr[i].nodes) {
+			node_set_ptr[i].weight += max_weight;
+			continue;	/* all nodes powered down */
+		}
+
+		/* Some nodes powered down, others up, split record */
+		node_set_ptr[node_set_inx].cpus_per_node =
+			node_set_ptr[i].cpus_per_node;
+		node_set_ptr[node_set_inx].real_memory =
+			node_set_ptr[i].real_memory;
+		node_set_ptr[node_set_inx].nodes = power_cnt;
+		node_set_ptr[i].nodes -= power_cnt;
+		node_set_ptr[node_set_inx].weight =
+			node_set_ptr[i].weight + max_weight;
+		node_set_ptr[node_set_inx].features =
+			xstrdup(node_set_ptr[i].features);
+		node_set_ptr[node_set_inx].feature_array =
+			node_set_ptr[i].feature_array;
+		node_set_ptr[node_set_inx].feature_bits =
+			bit_copy(node_set_ptr[i].feature_bits);
+		node_set_ptr[node_set_inx].my_bitmap =
+			bit_copy(node_set_ptr[i].my_bitmap);
+		bit_and(node_set_ptr[node_set_inx].my_bitmap,
+			power_node_bitmap);
+		if (power_up_bitmap == NULL) {
+			power_up_bitmap = bit_copy(power_node_bitmap);
+			bit_not(power_up_bitmap);
+		}
+		bit_and(node_set_ptr[i].my_bitmap, power_up_bitmap);
+
+		node_set_inx++;
+		xrealloc(node_set_ptr,
+			 sizeof(struct node_set) * (node_set_inx + 2));
+		node_set_ptr[node_set_inx + 1].my_bitmap = NULL;
+	}
+	FREE_NULL_BITMAP(power_up_bitmap);
+
 	*node_set_size = node_set_inx;
 	*node_set_pptr = node_set_ptr;
 	return SLURM_SUCCESS;
diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c
index 80465350fad..17d2af7146a 100644
--- a/src/slurmctld/power_save.c
+++ b/src/slurmctld/power_save.c
@@ -119,6 +119,7 @@ static void _do_power_work(void)
 			wake_cnt++;
 			suspend_cnt++;
 			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
+			bit_clear(power_node_bitmap, i);
 			node_ptr->node_state |= NODE_STATE_NO_RESPOND;
 			node_ptr->last_response = now;
 			bit_set(wake_node_bitmap, i);
@@ -137,6 +138,7 @@ static void _do_power_work(void)
 			sleep_cnt++;
 			resume_cnt++;
 			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
+			bit_set(power_node_bitmap, i);
 			bit_set(sleep_node_bitmap, i);
 		}
 	}
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 1d6cc344c73..f97757648a6 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -183,14 +183,17 @@ static int _build_bitmaps(void)
 	/* initialize the idle and up bitmaps */
 	FREE_NULL_BITMAP(idle_node_bitmap);
 	FREE_NULL_BITMAP(avail_node_bitmap);
+	FREE_NULL_BITMAP(power_node_bitmap);
 	FREE_NULL_BITMAP(share_node_bitmap);
 	FREE_NULL_BITMAP(up_node_bitmap);
 	idle_node_bitmap  = (bitstr_t *) bit_alloc(node_record_count);
 	avail_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
+	power_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
 	share_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
 	up_node_bitmap    = (bitstr_t *) bit_alloc(node_record_count);
 	if ((idle_node_bitmap  == NULL) ||
 	    (avail_node_bitmap == NULL) ||
+	    (power_node_bitmap == NULL) ||
 	    (share_node_bitmap == NULL) ||
 	    (up_node_bitmap    == NULL))
 		fatal ("bit_alloc malloc failure");
@@ -235,17 +238,15 @@ static int _build_bitmaps(void)
 	 * their configuration, resync DRAINED vs. DRAINING state */
 	for (i = 0; i < node_record_count; i++) {
 		uint16_t base_state, drain_flag, no_resp_flag, job_cnt;
+		struct node_record *node_ptr = node_record_table_ptr + i;
 
-		if (node_record_table_ptr[i].name[0] == '\0')
+		if (node_ptr->name[0] == '\0')
 			continue;	/* defunct */
-		base_state = node_record_table_ptr[i].node_state &
-			NODE_STATE_BASE;
-		drain_flag = node_record_table_ptr[i].node_state &
+		base_state = node_ptr->node_state & NODE_STATE_BASE;
+		drain_flag = node_ptr->node_state &
 			(NODE_STATE_DRAIN | NODE_STATE_FAIL);
-		no_resp_flag = node_record_table_ptr[i].node_state &
-			NODE_STATE_NO_RESPOND;
-		job_cnt = node_record_table_ptr[i].run_job_cnt +
-			node_record_table_ptr[i].comp_job_cnt;
+		no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
+		job_cnt = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
 
 		if (((base_state == NODE_STATE_IDLE) && (job_cnt == 0)) ||
 		    (base_state == NODE_STATE_DOWN))
@@ -256,9 +257,10 @@ static int _build_bitmaps(void)
 			bit_set(avail_node_bitmap, i);
 			bit_set(up_node_bitmap, i);
 		}
-		if (node_record_table_ptr[i].config_ptr)
-			bit_set(node_record_table_ptr[i].config_ptr->
-				node_bitmap, i);
+		if (node_ptr->node_state & NODE_STATE_POWER_SAVE)
+			bit_set(power_node_bitmap, i);
+		if (node_ptr->config_ptr)
+			bit_set(node_ptr->config_ptr->node_bitmap, i);
 	}
 	return error_code;
 }
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index c39b9796f4b..bd070ccaea0 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -251,6 +251,7 @@ extern bool ping_nodes_now;	/* if set, ping nodes immediately */
  *	avail_node_bitmap	Set if node's state is not DOWN, DRAINING/DRAINED,
  *				FAILING or NO_RESPOND (i.e. available to run a job)
  *	idle_node_bitmap	Set if node has no jobs allocated to it
+ *	power_node_bitmap	Set for nodes which are powered down
  *	share_node_bitmap	Set if any job allocated resources on that node
  *				is configured to not share the node with other
  *				jobs (--exclusive option specified by job or
@@ -260,6 +261,7 @@ extern bool ping_nodes_now;	/* if set, ping nodes immediately */
 extern bitstr_t *avail_node_bitmap;	/* bitmap of available nodes,
 					 * state not DOWN, DRAIN or FAILING */
 extern bitstr_t *idle_node_bitmap;	/* bitmap of idle nodes */
+extern bitstr_t *power_node_bitmap;	/* bitmap of powered down nodes */
 extern bitstr_t *share_node_bitmap;	/* bitmap of sharable nodes */
 extern bitstr_t *up_node_bitmap;	/* bitmap of up nodes, not DOWN */
 
-- 
GitLab
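
A note on the scheduling trick in _build_node_list() above: the patch never
refuses to use powered down nodes, it only re-weights them. Because SLURM
allocates lower-weight nodes first, adding the largest configured node weight
(max_weight) to every powered-down node_set guarantees that any powered-up set
sorts ahead of any powered-down one, while sleeping nodes remain available as
a last resort. The standalone program below illustrates the same
split-and-reweight idea outside of SLURM. It is only a sketch, not SLURM code:
a plain uint64_t mask stands in for bitstr_t, and struct node_set here and
split_powered_down() are hypothetical, simplified stand-ins for the real
structures and logic.

#include <stdio.h>
#include <stdint.h>

#define MAX_SETS 16

/* Hypothetical, simplified stand-ins for SLURM's node_set and bitstr_t. */
struct node_set {
	uint64_t node_mask;	/* one bit per node, like my_bitmap */
	int      nodes;		/* population count of node_mask */
	uint32_t weight;	/* scheduling weight; lower is allocated first */
};

static int popcount64(uint64_t x)
{
	int n = 0;
	while (x) {
		x &= x - 1;	/* clear lowest set bit */
		n++;
	}
	return n;
}

/* Split every set that mixes powered-up and powered-down nodes, adding
 * max_weight to each powered-down set so the scheduler reaches for
 * sleeping nodes only as a last resort.  Returns the new set count. */
static int split_powered_down(struct node_set *sets, int set_cnt,
			      uint64_t power_mask, uint32_t max_weight)
{
	int i, orig_cnt = set_cnt;

	for (i = 0; i < orig_cnt && set_cnt < MAX_SETS; i++) {
		uint64_t down = sets[i].node_mask & power_mask;
		int down_cnt = popcount64(down);

		if (down_cnt == 0)
			continue;		/* no nodes powered down */
		if (down_cnt == sets[i].nodes) {
			sets[i].weight += max_weight;
			continue;		/* all nodes powered down */
		}
		/* Mixed set: keep the powered-up nodes here and move the
		 * powered-down nodes into a new, heavier set. */
		sets[set_cnt] = sets[i];
		sets[set_cnt].node_mask = down;
		sets[set_cnt].nodes = down_cnt;
		sets[set_cnt].weight = sets[i].weight + max_weight;
		sets[i].node_mask &= ~down;
		sets[i].nodes -= down_cnt;
		set_cnt++;
	}
	return set_cnt;
}

int main(void)
{
	/* Nodes 0-3 in one set (weight 10); nodes 1 and 3 are asleep. */
	struct node_set sets[MAX_SETS] = {
		{ .node_mask = 0xF, .nodes = 4, .weight = 10 },
	};
	int i, cnt = split_powered_down(sets, 1, 0xA, 10);

	for (i = 0; i < cnt; i++)
		printf("set %d: mask=0x%llx weight=%u\n", i,
		       (unsigned long long)sets[i].node_mask,
		       sets[i].weight);
	return 0;
}

Run with one set of four nodes (mask 0xF, weight 10) and nodes 1 and 3 asleep
(power mask 0xA), this prints two sets: mask 0x5 at weight 10 and mask 0xA at
weight 20, which is the same split the patch performs per config_record.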