From bdd58a4c077709fe5c1c4564a8f4d94d79bc0445 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 16 Jun 2010 22:45:42 +0000 Subject: [PATCH] this reverts some earlier changes that appear to cause problems, still not sure why... --- src/slurmctld/gang.c | 94 +++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/src/slurmctld/gang.c b/src/slurmctld/gang.c index 86925752fa0..1ef1273d447 100644 --- a/src/slurmctld/gang.c +++ b/src/slurmctld/gang.c @@ -146,9 +146,11 @@ static uint32_t gs_debug_flags = 0; static uint16_t gs_fast_schedule = 0; static struct gs_part *gs_part_list = NULL; static uint32_t default_job_list_size = 64; +static uint32_t gs_resmap_size = 0; static pthread_mutex_t data_mutex = PTHREAD_MUTEX_INITIALIZER; -static uint16_t *gs_cpus_per_node = NULL; +static uint16_t *gs_bits_per_node = NULL; +static uint32_t *gs_bit_rep_count = NULL; static struct gs_part **gs_part_sorted = NULL; static uint32_t num_sorted_part = 0; @@ -187,7 +189,7 @@ static void _print_jobs(struct gs_part *p_ptr) if (gs_debug_flags & DEBUG_FLAG_GANG) { info("gang: part %s has %u jobs, %u shadows:", - p_ptr->part_name, p_ptr->num_jobs, p_ptr->num_shadows); + p_ptr->part_name, p_ptr->num_jobs, p_ptr->num_shadows); for (i = 0; i < p_ptr->num_shadows; i++) { info("gang: shadow job %u row_s %s, sig_s %s", p_ptr->shadow[i]->job_ptr->job_id, @@ -204,7 +206,7 @@ static void _print_jobs(struct gs_part *p_ptr) int s = bit_size(p_ptr->active_resmap); i = bit_set_count(p_ptr->active_resmap); info("gang: active resmap has %d of %d bits set", - i, s); + i, s); } } } @@ -223,23 +225,28 @@ static uint16_t _get_gr_type(void) return GS_NODE; } -/* For GS_CPU gs_cpus_per_node is the total number of CPUs per node. - * For GS_CORE and GS_SOCKET gs_cpus_per_node is the total number of - * cores per node. +/* For GS_CPU gs_bits_per_node is the total number of CPUs per node. 
+ * For GS_CORE and GS_SOCKET gs_bits_per_node is the total number of + * cores per node. + * This function also sets gs_resmap_size; */ static void _load_phys_res_cnt(void) { uint16_t bit = 0, sock = 0; - uint32_t i; + uint32_t i, bit_index = 0; struct node_record *node_ptr; - xfree(gs_cpus_per_node); + xfree(gs_bits_per_node); + xfree(gs_bit_rep_count); if ((gr_type != GS_CPU) && (gr_type != GS_CORE) && (gr_type != GS_SOCKET)) return; - gs_cpus_per_node = xmalloc(node_record_count * sizeof(uint16_t)); + gs_bits_per_node = xmalloc(node_record_count * sizeof(uint16_t)); + gs_bit_rep_count = xmalloc(node_record_count * sizeof(uint32_t)); + + gs_resmap_size = 0; for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count; i++, node_ptr++) { if (gr_type == GS_CPU) { @@ -256,7 +263,25 @@ static void _load_phys_res_cnt(void) bit = node_ptr->cores * sock; } } - gs_cpus_per_node[i] = bit; + + gs_resmap_size += bit; + if (gs_bits_per_node[bit_index] != bit) { + if (gs_bit_rep_count[bit_index] > 0) + bit_index++; + gs_bits_per_node[bit_index] = bit; + } + gs_bit_rep_count[bit_index]++; + } + + /* arrays must have trailing 0 */ + bit_index += 2; + xrealloc(gs_bits_per_node, bit_index * sizeof(uint16_t)); + xrealloc(gs_bit_rep_count, bit_index * sizeof(uint32_t)); + + bit_index--; + for (i = 0; i < bit_index; i++) { + debug3("gang: _load_phys_res_cnt: grp %d bits %u reps %u", + i, gs_bits_per_node[i], gs_bit_rep_count[i]); } } @@ -302,8 +327,9 @@ static void _destroy_parts(void) ptr = ptr->next; xfree(tmp->part_name); - for (i = 0; i < tmp->num_jobs; i++) + for (i = 0; i < tmp->num_jobs; i++) { xfree(tmp->job_list[i]); + } xfree(tmp->shadow); FREE_NULL_BITMAP(tmp->active_resmap); xfree(tmp->active_cpus); @@ -411,7 +437,8 @@ static int _job_fits_in_active_row(struct job_record *job_ptr, if ((gr_type == GS_CORE) || (gr_type == GS_SOCKET)) { return job_fits_into_cores(job_res, p_ptr->active_resmap, - gs_cpus_per_node, NULL); + gs_bits_per_node, + gs_bit_rep_count); } 
/* gr_type == GS_NODE || gr_type == GS_CPU */ @@ -490,14 +517,14 @@ static void _add_job_to_active(struct job_record *job_ptr, bit_nclear(p_ptr->active_resmap, 0, size-1); } add_job_to_cores(job_res, &(p_ptr->active_resmap), - gs_cpus_per_node, NULL); + gs_bits_per_node, gs_bit_rep_count); if (gr_type == GS_SOCKET) _fill_sockets(job_res->node_bitmap, p_ptr); } else { /* GS_NODE or GS_CPU */ if (!p_ptr->active_resmap) { debug3("gang: _add_job_to_active: job %u first", - job_ptr->job_id); + job_ptr->job_id); p_ptr->active_resmap = bit_copy(job_res->node_bitmap); } else if (p_ptr->jobs_active == 0) { debug3("gang: _add_job_to_active: job %u copied", @@ -552,16 +579,16 @@ static int _suspend_job(uint32_t job_id) suspend_msg_t msg; msg.job_id = job_id; - if (gs_debug_flags & DEBUG_FLAG_GANG) - info("gang: suspending %u", job_id); - else - debug("gang: suspending %u", job_id); msg.op = SUSPEND_JOB; rc = job_suspend(&msg, 0, -1, false, (uint16_t)NO_VAL); /* job_suspend() returns ESLURM_DISABLED if job is already suspended */ - if ((rc != SLURM_SUCCESS) && (rc != ESLURM_DISABLED)) { - info("gang: suspending job %u: %s", - job_id, slurm_strerror(rc)); + if (rc == SLURM_SUCCESS) { + if (gs_debug_flags & DEBUG_FLAG_GANG) + info("gang: suspending %u", job_id); + else + debug("gang: suspending %u", job_id); + } else if (rc != ESLURM_DISABLED) { + info("gang: suspending job %u: %s", job_id, slurm_strerror(rc)); } return rc; } @@ -572,15 +599,15 @@ static void _resume_job(uint32_t job_id) suspend_msg_t msg; msg.job_id = job_id; - if (gs_debug_flags & DEBUG_FLAG_GANG) - info("gang: resuming %u", job_id); - else - debug("gang: resuming %u", job_id); msg.op = RESUME_JOB; rc = job_suspend(&msg, 0, -1, false, (uint16_t)NO_VAL); - if ((rc != SLURM_SUCCESS) && (rc != ESLURM_ALREADY_DONE)) { - error("gang: resuming job %u: %s", - job_id, slurm_strerror(rc)); + if (rc == SLURM_SUCCESS) { + if (gs_debug_flags & DEBUG_FLAG_GANG) + info("gang: resuming %u", job_id); + else + debug("gang: 
resuming %u", job_id); + } else if (rc != ESLURM_ALREADY_DONE) { + error("gang: resuming job %u: %s", job_id, slurm_strerror(rc)); } } @@ -1132,7 +1159,8 @@ extern int gs_fini(void) _destroy_parts(); xfree(gs_part_sorted); gs_part_sorted = NULL; - xfree(gs_cpus_per_node); + xfree(gs_bits_per_node); + xfree(gs_bit_rep_count); pthread_mutex_unlock(&data_mutex); debug3("gang: leaving gs_fini"); @@ -1239,8 +1267,10 @@ extern int gs_job_fini(struct job_record *job_ptr) * - nodes can be removed from a partition, or added to a partition * - this affects the size of the active resmap * - * If nodes have been added or removed, we need to resize the existing resmaps - * to prevent errors when comparing them. + * If nodes have been added or removed, then the node_record_count + * will be different from gs_resmap_size. In this case, we need + * to resize the existing resmaps to prevent errors when comparing + * them. * * Here's the plan: * 1. save a copy of the global structures, and then construct @@ -1323,7 +1353,7 @@ extern int gs_reconfig(void) if (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority != 0)) { debug3("resuming job %u apparently suspended " - "by gang", job_ptr->job_id); + " by gang", job_ptr->job_id); _resume_job(job_ptr->job_id); } -- GitLab