From 9c1572946aacadc59adf71b2bbdff22b493eff40 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 15 Mar 2007 23:56:14 +0000 Subject: [PATCH] Improve node scheduling logic given that select_g_job_test() is destructive of the bitmap argument. --- src/slurmctld/node_scheduler.c | 38 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 68d8aec932c..978814cdeb1 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -291,36 +291,43 @@ _pick_best_load(struct job_record *job_ptr, bitstr_t * bitmap, _node_load_bitmaps(bitmap, &no_load_bit, &light_load_bit, &heavy_load_bit); - - /* first try to use idle nodes */ - bit_and(bitmap, no_load_bit); - FREE_NULL_BITMAP(no_load_bit); + /* always include required nodes or selection algorithm fails, * note that we have already confirmed these nodes are available * to this job */ if (job_ptr->details && job_ptr->details->req_node_bitmap) - bit_or(bitmap, job_ptr->details->req_node_bitmap); - + bit_or(no_load_bit, job_ptr->details->req_node_bitmap); + + /* NOTE: select_g_job_test() is destructive of bitmap */ + + /* first try to use idle nodes */ + bit_and(bitmap, no_load_bit); error_code = select_g_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, test_only); /* now try to use idle and lightly loaded nodes */ if (error_code) { + bit_nclear(bitmap, 0, (node_record_count-1)); + bit_or(bitmap, no_load_bit); bit_or(bitmap, light_load_bit); error_code = select_g_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, test_only); } - FREE_NULL_BITMAP(light_load_bit); /* now try to use all possible nodes */ if (error_code) { + bit_nclear(bitmap, 0, (node_record_count-1)); + bit_or(bitmap, no_load_bit); + bit_or(bitmap, light_load_bit); bit_or(bitmap, heavy_load_bit); error_code = select_g_job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, test_only); } + FREE_NULL_BITMAP(no_load_bit); + FREE_NULL_BITMAP(light_load_bit); FREE_NULL_BITMAP(heavy_load_bit); return error_code; @@ -452,6 +459,7 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, int avail_nodes = 0, avail_cpus = 0; int avail_mem = 0; /* avail_: resources available for use now */ bitstr_t *avail_bitmap = NULL, *total_bitmap = NULL; + bitstr_t *backup_bitmap = NULL; bitstr_t *partially_idle_node_bitmap = NULL, *possible_bitmap = NULL; int max_feature, min_feature; bool runable_ever = false; /* Job can ever run */ @@ -612,10 +620,6 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, } for (j = min_feature; j <= max_feature; j++) { - /* we use this var to go straight down the list if the - * first one doesn't work we go to the next until the - * list is empty. - */ for (i = 0; i < node_set_size; i++) { bool pick_light_load = false; if (node_set_ptr[i].feature != j) @@ -707,6 +711,10 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, continue; /* Keep accumulating nodes */ if (avail_cpus < job_ptr->num_procs) continue; /* Keep accumulating CPUs */ + + /* NOTE: select_g_job_test() is destructive of + * avail_bitmap, so save a backup copy */ + backup_bitmap = bit_copy(avail_bitmap); if (pick_light_load) { pick_code = _pick_best_load(job_ptr, avail_bitmap, @@ -722,8 +730,9 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, req_nodes, false); } - + if (pick_code == SLURM_SUCCESS) { + FREE_NULL_BITMAP(backup_bitmap); if (bit_set_count(avail_bitmap) > max_nodes) { /* end of tests for this feature */ avail_nodes = 0; @@ -738,11 +747,8 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, *select_bitmap = avail_bitmap; return SLURM_SUCCESS; } else { - /* reset the counters and start from the - * next node in the list */ FREE_NULL_BITMAP(avail_bitmap); - avail_nodes = 0; - avail_cpus = 0; + avail_bitmap = backup_bitmap; } } -- GitLab