Commit bdd58a4c authored by Moe Jette

This reverts some earlier changes that appear to cause problems; still not sure why.

parent fa42fc5d
@@ -146,9 +146,11 @@ static uint32_t gs_debug_flags = 0;
 static uint16_t gs_fast_schedule = 0;
 static struct gs_part *gs_part_list = NULL;
 static uint32_t default_job_list_size = 64;
+static uint32_t gs_resmap_size = 0;
 static pthread_mutex_t data_mutex = PTHREAD_MUTEX_INITIALIZER;
-static uint16_t *gs_cpus_per_node = NULL;
+static uint16_t *gs_bits_per_node = NULL;
+static uint32_t *gs_bit_rep_count = NULL;
 static struct gs_part **gs_part_sorted = NULL;
 static uint32_t num_sorted_part = 0;
@@ -187,7 +189,7 @@ static void _print_jobs(struct gs_part *p_ptr)
     if (gs_debug_flags & DEBUG_FLAG_GANG) {
         info("gang: part %s has %u jobs, %u shadows:",
              p_ptr->part_name, p_ptr->num_jobs, p_ptr->num_shadows);
         for (i = 0; i < p_ptr->num_shadows; i++) {
             info("gang: shadow job %u row_s %s, sig_s %s",
                  p_ptr->shadow[i]->job_ptr->job_id,
@@ -204,7 +206,7 @@ static void _print_jobs(struct gs_part *p_ptr)
         int s = bit_size(p_ptr->active_resmap);
         i = bit_set_count(p_ptr->active_resmap);
         info("gang: active resmap has %d of %d bits set",
              i, s);
         }
     }
 }
@@ -223,23 +225,28 @@ static uint16_t _get_gr_type(void)
     return GS_NODE;
 }
-/* For GS_CPU gs_cpus_per_node is the total number of CPUs per node.
- * For GS_CORE and GS_SOCKET gs_cpus_per_node is the total number of
- * cores per node.
+/* For GS_CPU gs_bits_per_node is the total number of CPUs per node.
+ * For GS_CORE and GS_SOCKET gs_bits_per_node is the total number of
+ * cores per per node.
+ * This function also sets gs_resmap_size;
  */
 static void _load_phys_res_cnt(void)
 {
     uint16_t bit = 0, sock = 0;
-    uint32_t i;
+    uint32_t i, bit_index = 0;
     struct node_record *node_ptr;
-    xfree(gs_cpus_per_node);
+    xfree(gs_bits_per_node);
+    xfree(gs_bit_rep_count);
     if ((gr_type != GS_CPU) && (gr_type != GS_CORE) &&
         (gr_type != GS_SOCKET))
         return;
-    gs_cpus_per_node = xmalloc(node_record_count * sizeof(uint16_t));
+    gs_bits_per_node = xmalloc(node_record_count * sizeof(uint16_t));
+    gs_bit_rep_count = xmalloc(node_record_count * sizeof(uint32_t));
+    gs_resmap_size = 0;
     for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
          i++, node_ptr++) {
         if (gr_type == GS_CPU) {
@@ -256,7 +263,25 @@ static void _load_phys_res_cnt(void)
                 bit = node_ptr->cores * sock;
             }
         }
-        gs_cpus_per_node[i] = bit;
+        gs_resmap_size += bit;
+        if (gs_bits_per_node[bit_index] != bit) {
+            if (gs_bit_rep_count[bit_index] > 0)
+                bit_index++;
+            gs_bits_per_node[bit_index] = bit;
+        }
+        gs_bit_rep_count[bit_index]++;
+    }
+    /* arrays must have trailing 0 */
+    bit_index += 2;
+    xrealloc(gs_bits_per_node, bit_index * sizeof(uint16_t));
+    xrealloc(gs_bit_rep_count, bit_index * sizeof(uint32_t));
+    bit_index--;
+    for (i = 0; i < bit_index; i++) {
+        debug3("gang: _load_phys_res_cnt: grp %d bits %u reps %u",
+               i, gs_bits_per_node[i], gs_bit_rep_count[i]);
     }
 }
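Editor's note, not part of the commit: the restored loop above run-length encodes the per-node bit counts, storing each distinct count once in gs_bits_per_node[] and the length of its run in gs_bit_rep_count[], with a trailing 0 entry so readers know where to stop. The following stand-alone sketch shows the same compression idea on a toy input; the names, calloc, and printf are illustrative stand-ins for Slurm's xmalloc and debug machinery, not gang.c code.

/* run-length encode a list of per-node bit counts into (bits[], reps[]) */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

int main(void)
{
	/* example per-node bit counts (e.g. cores per node) */
	uint16_t node_bits[] = { 8, 8, 8, 16, 16, 4 };
	size_t node_cnt = sizeof(node_bits) / sizeof(node_bits[0]);

	/* worst case: every node differs; the extra entry stays 0 as terminator */
	uint16_t *bits = calloc(node_cnt + 1, sizeof(uint16_t));
	uint32_t *reps = calloc(node_cnt + 1, sizeof(uint32_t));
	size_t idx = 0;

	for (size_t i = 0; i < node_cnt; i++) {
		if (bits[idx] != node_bits[i]) {
			if (reps[idx] > 0)	/* close the previous run */
				idx++;
			bits[idx] = node_bits[i];
		}
		reps[idx]++;
	}

	for (size_t i = 0; reps[i] > 0; i++)
		printf("group %zu: %u bits x %u nodes\n",
		       i, (unsigned)bits[i], (unsigned)reps[i]);

	free(bits);
	free(reps);
	return 0;
}

For the input above this prints three groups (8x3, 16x2, 4x1), which is the shape of data that job_fits_into_cores() and add_job_to_cores() consume in the hunks below.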
@@ -302,8 +327,9 @@ static void _destroy_parts(void)
         ptr = ptr->next;
         xfree(tmp->part_name);
-        for (i = 0; i < tmp->num_jobs; i++)
+        for (i = 0; i < tmp->num_jobs; i++) {
             xfree(tmp->job_list[i]);
+        }
         xfree(tmp->shadow);
         FREE_NULL_BITMAP(tmp->active_resmap);
         xfree(tmp->active_cpus);
@@ -411,7 +437,8 @@ static int _job_fits_in_active_row(struct job_record *job_ptr,
     if ((gr_type == GS_CORE) || (gr_type == GS_SOCKET)) {
         return job_fits_into_cores(job_res, p_ptr->active_resmap,
-                                   gs_cpus_per_node, NULL);
+                                   gs_bits_per_node,
+                                   gs_bit_rep_count);
     }
     /* gr_type == GS_NODE || gr_type == GS_CPU */
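Editor's note, not part of the commit: job_fits_into_cores() above (and add_job_to_cores() in the next hunk) take the (gs_bits_per_node, gs_bit_rep_count) pair so they can locate each node's slice inside one flat core resmap. The helper below is a rough sketch of that offset arithmetic under a run-length encoding like the one restored here; node_bit_offset() and its inputs are made-up names, not the Slurm implementation.

/* locate a node's starting bit in a flat resmap from (bits, reps) runs */
#include <stdio.h>
#include <stdint.h>

/* return the bit offset of node `node_inx`, given run-length encoded
 * per-node bit counts terminated by a 0 repeat count */
static uint32_t node_bit_offset(const uint16_t *bits, const uint32_t *reps,
				uint32_t node_inx)
{
	uint32_t offset = 0, covered = 0;

	for (size_t g = 0; reps[g] > 0; g++) {
		if (node_inx < covered + reps[g])
			return offset + (node_inx - covered) * bits[g];
		offset  += reps[g] * bits[g];
		covered += reps[g];
	}
	return offset;	/* node_inx past the end: offset of the resmap end */
}

int main(void)
{
	/* three 8-bit nodes, then two 16-bit nodes; trailing 0 terminates */
	uint16_t bits[] = { 8, 16, 0 };
	uint32_t reps[] = { 3,  2, 0 };

	for (uint32_t n = 0; n < 5; n++)
		printf("node %u starts at bit %u\n",
		       n, node_bit_offset(bits, reps, n));
	return 0;
}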
@@ -490,14 +517,14 @@ static void _add_job_to_active(struct job_record *job_ptr,
             bit_nclear(p_ptr->active_resmap, 0, size-1);
         }
         add_job_to_cores(job_res, &(p_ptr->active_resmap),
-                         gs_cpus_per_node, NULL);
+                         gs_bits_per_node, gs_bit_rep_count);
         if (gr_type == GS_SOCKET)
             _fill_sockets(job_res->node_bitmap, p_ptr);
     } else {
         /* GS_NODE or GS_CPU */
         if (!p_ptr->active_resmap) {
             debug3("gang: _add_job_to_active: job %u first",
                    job_ptr->job_id);
             p_ptr->active_resmap = bit_copy(job_res->node_bitmap);
         } else if (p_ptr->jobs_active == 0) {
             debug3("gang: _add_job_to_active: job %u copied",
@@ -552,16 +579,16 @@ static int _suspend_job(uint32_t job_id)
     suspend_msg_t msg;
     msg.job_id = job_id;
-    if (gs_debug_flags & DEBUG_FLAG_GANG)
-        info("gang: suspending %u", job_id);
-    else
-        debug("gang: suspending %u", job_id);
     msg.op = SUSPEND_JOB;
     rc = job_suspend(&msg, 0, -1, false, (uint16_t)NO_VAL);
     /* job_suspend() returns ESLURM_DISABLED if job is already suspended */
-    if ((rc != SLURM_SUCCESS) && (rc != ESLURM_DISABLED)) {
-        info("gang: suspending job %u: %s",
-             job_id, slurm_strerror(rc));
+    if (rc == SLURM_SUCCESS) {
+        if (gs_debug_flags & DEBUG_FLAG_GANG)
+            info("gang: suspending %u", job_id);
+        else
+            debug("gang: suspending %u", job_id);
+    } else if (rc != ESLURM_DISABLED) {
+        info("gang: suspending job %u: %s", job_id, slurm_strerror(rc));
     }
     return rc;
 }
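Editor's note, not part of the commit: the restored _suspend_job() above (and _resume_job() in the next hunk) order the work as: call job_suspend() first, log the action only on success, treat the "already in that state" return code as benign, and report anything else. A minimal stand-alone illustration of that pattern follows; fake_suspend() and the OP_* codes are stand-ins, not the Slurm API or its error codes.

/* log the action only after the call succeeds; ignore "already done" */
#include <stdio.h>

enum { OP_SUCCESS = 0, OP_DISABLED = 1, OP_FAILURE = 2 };

/* stand-in for job_suspend(): pretend even job ids are already suspended */
static int fake_suspend(unsigned job_id)
{
	return (job_id % 2 == 0) ? OP_DISABLED : OP_SUCCESS;
}

static int suspend_job(unsigned job_id)
{
	int rc = fake_suspend(job_id);

	if (rc == OP_SUCCESS)
		printf("gang: suspending %u\n", job_id);   /* log on success only */
	else if (rc != OP_DISABLED)
		printf("gang: suspending job %u: error %d\n", job_id, rc);
	/* OP_DISABLED (already suspended) is silently ignored */
	return rc;
}

int main(void)
{
	suspend_job(7);		/* logs "suspending 7" */
	suspend_job(8);		/* already suspended: no output */
	return 0;
}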
@@ -572,15 +599,15 @@ static void _resume_job(uint32_t job_id)
     suspend_msg_t msg;
     msg.job_id = job_id;
-    if (gs_debug_flags & DEBUG_FLAG_GANG)
-        info("gang: resuming %u", job_id);
-    else
-        debug("gang: resuming %u", job_id);
     msg.op = RESUME_JOB;
     rc = job_suspend(&msg, 0, -1, false, (uint16_t)NO_VAL);
-    if ((rc != SLURM_SUCCESS) && (rc != ESLURM_ALREADY_DONE)) {
-        error("gang: resuming job %u: %s",
-              job_id, slurm_strerror(rc));
+    if (rc == SLURM_SUCCESS) {
+        if (gs_debug_flags & DEBUG_FLAG_GANG)
+            info("gang: resuming %u", job_id);
+        else
+            debug("gang: resuming %u", job_id);
+    } else if (rc != ESLURM_ALREADY_DONE) {
+        error("gang: resuming job %u: %s", job_id, slurm_strerror(rc));
     }
 }
@@ -1132,7 +1159,8 @@ extern int gs_fini(void)
     _destroy_parts();
     xfree(gs_part_sorted);
     gs_part_sorted = NULL;
-    xfree(gs_cpus_per_node);
+    xfree(gs_bits_per_node);
+    xfree(gs_bit_rep_count);
     pthread_mutex_unlock(&data_mutex);
     debug3("gang: leaving gs_fini");
@@ -1239,8 +1267,10 @@ extern int gs_job_fini(struct job_record *job_ptr)
  * - nodes can be removed from a partition, or added to a partition
  * - this affects the size of the active resmap
  *
- * If nodes have been added or removed, we need to resize the existing resmaps
- * to prevent errors when comparing them.
+ * If nodes have been added or removed, then the node_record_count
+ * will be different from gs_resmap_size. In this case, we need
+ * to resize the existing resmaps to prevent errors when comparing
+ * them.
  *
  * Here's the plan:
  * 1. save a copy of the global structures, and then construct
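Editor's note, not part of the commit: the restored comment above argues that when the node count changes, the existing resmaps must be resized before old and new maps can be compared safely. A rough, stand-alone sketch of that resize step follows, assuming a byte-per-bit map and plain realloc rather than Slurm's bitstring type; resize_resmap() is a made-up name.

/* grow or shrink a bit map (one byte per bit, for clarity) so that a map
 * built against an old node count can be compared against a new one */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

static unsigned char *resize_resmap(unsigned char *map, size_t old_bits,
				    size_t new_bits)
{
	unsigned char *new_map = realloc(map, new_bits);
	if (!new_map)
		return map;	/* keep the old map on allocation failure */
	if (new_bits > old_bits)	/* newly added bits start out clear */
		memset(new_map + old_bits, 0, new_bits - old_bits);
	return new_map;
}

int main(void)
{
	size_t old_bits = 4, new_bits = 6;
	unsigned char *resmap = calloc(old_bits, 1);

	resmap[1] = resmap[3] = 1;	/* two bits set in the old map */
	resmap = resize_resmap(resmap, old_bits, new_bits);

	for (size_t i = 0; i < new_bits; i++)
		printf("bit %zu = %u\n", i, resmap[i]);
	free(resmap);
	return 0;
}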
@@ -1323,7 +1353,7 @@ extern int gs_reconfig(void)
         if (IS_JOB_SUSPENDED(job_ptr) &&
             (job_ptr->priority != 0)) {
             debug3("resuming job %u apparently suspended "
-                   "by gang", job_ptr->job_id);
+                   " by gang", job_ptr->job_id);
             _resume_job(job_ptr->job_id);
         }