diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 2489319a09bff7d370802832c5444807d9c3b3de..0c3c275a788fbb4039f7b3c0c2f26f8674eb8fb3 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -1021,10 +1021,22 @@ _pick_step_nodes (struct job_record *job_ptr, goto cleanup; } if (!bit_super_set(selected_nodes, nodes_avail)) { - *return_code = ESLURM_INVALID_TASK_MEMORY; - info ("_pick_step_nodes: requested nodes %s " - "have inadequate memory", - step_spec->node_list); + /* + * If some nodes still have some memory allocated + * to other steps, just defer the execution of the + * step + */ + if (mem_blocked_nodes == 0) { + *return_code = ESLURM_INVALID_TASK_MEMORY; + info ("_pick_step_nodes: requested nodes %s " + "have inadequate memory", + step_spec->node_list); + } else { + *return_code = ESLURM_NODES_BUSY; + info ("_pick_step_nodes: some requested nodes" + " %s still have memory used by other steps", + step_spec->node_list); + } FREE_NULL_BITMAP(selected_nodes); goto cleanup; } @@ -1168,7 +1180,7 @@ _pick_step_nodes (struct job_record *job_ptr, } if (step_spec->min_nodes) { - int cpus_needed, node_avail_cnt, node_idle_cnt, nodes_needed; + int cpus_needed, node_avail_cnt, nodes_needed; if (usable_cpu_cnt == NULL) { usable_cpu_cnt = xmalloc(sizeof(uint32_t) * @@ -1218,10 +1230,6 @@ _pick_step_nodes (struct job_record *job_ptr, nodes_needed = 0; } } - if (nodes_idle) - node_idle_cnt = bit_set_count(nodes_idle); - else - node_idle_cnt = 0; if (nodes_avail) node_avail_cnt = bit_set_count(nodes_avail); else @@ -1241,10 +1249,9 @@ _pick_step_nodes (struct job_record *job_ptr, usable_cpu_cnt); if (node_tmp == NULL) { int avail_node_cnt = bit_set_count(nodes_avail); - if ((avail_node_cnt < node_idle_cnt) && - (step_spec->min_nodes <= - (avail_node_cnt + nodes_picked_cnt + - mem_blocked_nodes))) { + if (step_spec->min_nodes <= + (avail_node_cnt + nodes_picked_cnt + + mem_blocked_nodes)) { *return_code = ESLURM_NODES_BUSY; } else if (!bit_super_set(job_ptr->node_bitmap, up_node_bitmap)) { diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index cac1b88d7259150cee2cd7e5f41638dabe742eb0..67ad1da1295247e120e2f3d2cb301d5b2b89d560 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -1776,8 +1776,11 @@ _wait_for_all_tasks(slurmd_job_t *job) static void *_kill_thr(void *args) { kill_thread_t *kt = ( kill_thread_t *) args; - sleep(kt->secs); - pthread_kill(kt->thread_id, SIGKILL); + unsigned int pause = kt->secs; + do { + pause = sleep(pause); + } while (pause > 0); + pthread_cancel(kt->thread_id); xfree(kt); return NULL; }