diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 27bb3c6c20bf5854e6a97e9c4e59521c39ea98fe..3f62051b65e88d2fd64cacf64f618e363a6e8795 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -1000,7 +1000,7 @@ static int _job_expand(struct job_record *from_job_ptr, } node_cnt = bit_set_count(from_job_resrcs_ptr->node_bitmap) + - bit_set_count(to_job_resrcs_ptr->node_bitmap) - + bit_set_count(to_job_resrcs_ptr->node_bitmap) - bit_overlap(from_job_resrcs_ptr->node_bitmap, to_job_resrcs_ptr->node_bitmap); new_job_resrcs_ptr = _create_job_resources(node_cnt); @@ -1042,18 +1042,16 @@ static int _job_expand(struct job_record *from_job_ptr, if (from_node_used) { /* Merge alloc info from both "from" and "to" jobs, * leave "from" job with no allocated CPUs or memory */ - new_job_resrcs_ptr->cpus[new_node_offset] += + new_job_resrcs_ptr->cpus[new_node_offset] = from_job_resrcs_ptr->cpus[from_node_offset]; from_job_resrcs_ptr->cpus[from_node_offset] = 0; - /* new_job_resrcs_ptr->cpus_used[new_node_offset] += + /* new_job_resrcs_ptr->cpus_used[new_node_offset] = from_job_resrcs_ptr-> cpus_used[from_node_offset]; Should be 0 */ - new_job_resrcs_ptr->memory_allocated[new_node_offset]+= + new_job_resrcs_ptr->memory_allocated[new_node_offset] = from_job_resrcs_ptr-> memory_allocated[from_node_offset]; - from_job_resrcs_ptr-> - memory_allocated[from_node_offset] = 0; - /* new_job_resrcs_ptr->memory_used[new_node_offset] += + /* new_job_resrcs_ptr->memory_used[new_node_offset] = from_job_resrcs_ptr-> memory_used[from_node_offset]; Should be 0 */ job_resources_bits_copy(new_job_resrcs_ptr, @@ -1062,41 +1060,14 @@ static int _job_expand(struct job_record *from_job_ptr, from_node_offset); } if (to_node_used) { - /* Merge alloc info from both "from" and "to" jobs */ + /* Merge alloc info from both "from" and "to" jobs. */ new_job_resrcs_ptr->cpus[new_node_offset] += to_job_resrcs_ptr->cpus[to_node_offset]; new_job_resrcs_ptr->cpus_used[new_node_offset] += to_job_resrcs_ptr->cpus_used[to_node_offset]; - if (!from_node_used || - (from_job_ptr->details->pn_min_memory & - MEM_PER_CPU)) { - /* node allocated by one job or allocating - * memory by CPU, add mem allocations */ - new_job_resrcs_ptr-> - memory_allocated[new_node_offset] += - to_job_resrcs_ptr-> - memory_allocated[to_node_offset]; - } else if (from_node_used) { - /* mem allocated by node and both jobs have - * allocations on the same node */ - if (select_node_usage[i].alloc_memory < - to_job_resrcs_ptr-> - memory_allocated[to_node_offset]) { - error("cons_res: node %s memory is " - "under-allocated (%u-%u) for " - "job %u", - node_ptr->name, - select_node_usage[i].alloc_memory, - to_job_resrcs_ptr-> - memory_allocated[to_node_offset], - to_job_ptr->job_id); - select_node_usage[i].alloc_memory = 0; - } else { - select_node_usage[i].alloc_memory -= - to_job_resrcs_ptr-> - memory_allocated[to_node_offset]; - } - } + new_job_resrcs_ptr->memory_allocated[new_node_offset]+= + to_job_resrcs_ptr-> + memory_allocated[to_node_offset]; new_job_resrcs_ptr->memory_used[new_node_offset] += to_job_resrcs_ptr->memory_used[to_node_offset]; job_resources_bits_copy(new_job_resrcs_ptr, diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index c9257f759ece07b4916a4c0b16bb88dd186c4fd9..e661b4205af052068902ffcc1613286e7771c812 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -1553,7 +1553,9 @@ static int _job_expand(struct job_record *from_job_ptr, } node_cnt = bit_set_count(from_job_resrcs_ptr->node_bitmap) + - bit_set_count(to_job_resrcs_ptr->node_bitmap); + bit_set_count(to_job_resrcs_ptr->node_bitmap) - + bit_overlap(from_job_resrcs_ptr->node_bitmap, + to_job_resrcs_ptr->node_bitmap); new_job_resrcs_ptr = _create_job_resources(node_cnt); new_job_resrcs_ptr->ncpus = from_job_resrcs_ptr->ncpus + to_job_resrcs_ptr->ncpus; @@ -1593,66 +1595,35 @@ static int _job_expand(struct job_record *from_job_ptr, if (from_node_used) { /* Merge alloc info from both "from" and "to" jobs, * leave "from" job with no allocated CPUs or memory */ - new_job_resrcs_ptr->cpus[new_node_offset] += + new_job_resrcs_ptr->cpus[new_node_offset] = from_job_resrcs_ptr->cpus[from_node_offset]; from_job_resrcs_ptr->cpus[from_node_offset] = 0; - /* new_job_resrcs_ptr->cpus_used[new_node_offset] += + /* new_job_resrcs_ptr->cpus_used[new_node_offset] = from_job_resrcs_ptr-> cpus_used[from_node_offset]; Should be 0 */ - new_job_resrcs_ptr->memory_allocated[new_node_offset]+= + new_job_resrcs_ptr->memory_allocated[new_node_offset] = from_job_resrcs_ptr-> memory_allocated[from_node_offset]; - from_job_resrcs_ptr-> - memory_allocated[from_node_offset] = 0; - /* new_job_resrcs_ptr->memory_used[new_node_offset] += + /* new_job_resrcs_ptr->memory_used[new_node_offset] = from_job_resrcs_ptr-> memory_used[from_node_offset]; Should be 0 */ - if (to_node_used && - (to_job_ptr->details->shared == 0)) { - if (cr_ptr->nodes[i].exclusive_cnt) - cr_ptr->nodes[i].exclusive_cnt--; - else { - error("select/linear: exclusive_cnt " - "underflow for node %s", - node_ptr->name); - } - } job_resources_bits_copy(new_job_resrcs_ptr, new_node_offset, from_job_resrcs_ptr, from_node_offset); } if (to_node_used) { - /* Merge alloc info from both "from" and "to" jobs */ - new_job_resrcs_ptr->cpus[new_node_offset] += + /* Merge alloc info from both "from" and "to" jobs. */ + + /* DO NOT double count the allocated CPUs for the + * select/linear plugin. */ + new_job_resrcs_ptr->cpus[new_node_offset] = to_job_resrcs_ptr->cpus[to_node_offset]; new_job_resrcs_ptr->cpus_used[new_node_offset] += to_job_resrcs_ptr->cpus_used[to_node_offset]; - if (!from_node_used || - (from_job_ptr->details->pn_min_memory & - MEM_PER_CPU)) { - /* node allocated by one job or allocating - * memory by CPU, add mem allocations */ - new_job_resrcs_ptr-> - memory_allocated[new_node_offset] += - to_job_resrcs_ptr-> - memory_allocated[to_node_offset]; - } else if (from_node_used) { - /* mem allocated by node and both jobs have - * allocations on the same node */ - if (cr_ptr->nodes[i].alloc_memory >= - to_job_resrcs_ptr-> - memory_allocated[to_node_offset]) { - cr_ptr->nodes[i].alloc_memory -= - to_job_resrcs_ptr-> - memory_allocated[to_node_offset]; - } else { - cr_ptr->nodes[i].alloc_memory = 0; - error("select/linear: memory underflow" - " for node %s", - node_ptr->name); - } - } + new_job_resrcs_ptr->memory_allocated[new_node_offset]+= + to_job_resrcs_ptr-> + memory_allocated[to_node_offset]; new_job_resrcs_ptr->memory_used[new_node_offset] += to_job_resrcs_ptr->memory_used[to_node_offset]; job_resources_bits_copy(new_job_resrcs_ptr, @@ -2090,6 +2061,8 @@ static void _init_node_cr(void) /* build partition records */ part_iterator = list_iterator_create(part_list); + if (part_iterator == NULL) + fatal("list_iterator_create: malloc failure"); while ((part_ptr = (struct part_record *) list_next(part_iterator))) { for (i = 0; i < select_node_cnt; i++) { if (part_ptr->node_bitmap == NULL) diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 2af17c1310cf3f215dfcdc81b1614c021107e64f..b95a8e5b4ee1b083511c10a3dd0759cbc229016e 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -934,10 +934,6 @@ extern int test_job_dependency(struct job_record *job_ptr) end_time - now; job_ptr->time_limit /= 60; /* sec to min */ } - if (job_ptr->details && dep_ptr->job_ptr->details) { - job_ptr->details->pn_min_memory = - dep_ptr->job_ptr->details->pn_min_memory; - } if (job_ptr->details && dep_ptr->job_ptr->details) { job_ptr->details->shared = dep_ptr->job_ptr->details->shared; @@ -1088,10 +1084,9 @@ extern int update_job_dependency(struct job_record *job_ptr, char *new_depend) (dep_job_ptr->qos_id != job_ptr->qos_id) || (dep_job_ptr->part_ptr == NULL) || (job_ptr->part_ptr == NULL) || - (job_ptr->part_ptr->max_share != 1) || (dep_job_ptr->part_ptr != job_ptr->part_ptr))) { - /* Expand only one active job in the same QOS - * and partition without shared resources */ + /* Expand only jobs in the same QOS and + * and partition */ rc = ESLURM_DEPENDENCY; break; }