From f6d42fdbb293ca89da609779db8d8c04a86a8d13 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Mon, 13 Feb 2017 11:27:59 -0700 Subject: [PATCH] Don't start job early Ensure job does not start running before node is booted and PrologSlurmctld is complete. bug 3446 --- NEWS | 2 ++ src/slurmctld/job_mgr.c | 11 +++++------ src/slurmctld/job_scheduler.c | 13 ++++++++----- src/slurmctld/node_scheduler.c | 2 +- src/slurmctld/power_save.c | 9 ++++++++- src/slurmctld/slurmctld.h | 4 ++++ 6 files changed, 28 insertions(+), 13 deletions(-) diff --git a/NEWS b/NEWS index 71285be629d..4ca7ca1a4c7 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,8 @@ documents those changes that are of interest to users and administrators. -- burst_buffer/cray - Support default pool which is not the first pool reported by DataWarp and log in Slurm when pools that are added or removed from DataWarp. + -- Ensure job does not start running before node is booted and PrologSlurmctld + is complete. * Changes in Slurm 16.05.9 ========================== diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index c63cc1156a3..5ce62443236 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -7481,8 +7481,9 @@ extern void job_config_fini(struct job_record *job_ptr) } } -#ifndef HAVE_BG -static bool _test_nodes_ready(struct job_record *job_ptr) +/* Determine if the nodes are ready to run a job + * RET true if ready */ +extern bool test_job_nodes_ready(struct job_record *job_ptr) { if (bit_overlap(job_ptr->node_bitmap, power_node_bitmap)) return false; @@ -7504,7 +7505,6 @@ static bool _test_nodes_ready(struct job_record *job_ptr) return true; } -#endif /* * Modify a job's memory limit if allocated all memory on a node and the node @@ -7576,7 +7576,7 @@ void job_time_limit(void) if (job_ptr->details) prolog = job_ptr->details->prolog_running; if ((prolog == 0) && IS_JOB_CONFIGURING(job_ptr) && - _test_nodes_ready(job_ptr)) { + test_job_nodes_ready(job_ptr)) { info("%s: 
Configuration for job %u is complete", __func__, job_ptr->job_id); job_config_fini(job_ptr); @@ -12860,7 +12860,6 @@ job_alloc_info(uint32_t uid, uint32_t job_id, struct job_record **job_pptr) (prolog == 0) && job_ptr->node_bitmap && (bit_overlap(power_node_bitmap, job_ptr->node_bitmap) == 0)) { last_job_update = time(NULL); - job_ptr->job_state &= (~JOB_CONFIGURING); set_job_alias_list(job_ptr); } @@ -13586,7 +13585,7 @@ extern int job_node_ready(uint32_t job_id, int *ready) job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") && job_ptr->node_bitmap && (bit_overlap(power_node_bitmap, job_ptr->node_bitmap) == 0)) { - job_ptr->job_state &= (~JOB_CONFIGURING); + last_job_update = time(NULL); set_job_alias_list(job_ptr); } diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 33973e5c4ca..6ffbb8800b6 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -3933,11 +3933,14 @@ extern void prolog_running_decr(struct job_record *job_ptr) (--job_ptr->details->prolog_running > 0)) return; - job_ptr->job_state &= ~JOB_CONFIGURING; - if (job_ptr->batch_flag && - ((job_ptr->bit_flags & NODE_REBOOT) == 0) && - (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { - launch_job(job_ptr); + if (IS_JOB_CONFIGURING(job_ptr) && test_job_nodes_ready(job_ptr)) { + info("%s: Configuration for job %u is complete", + __func__, job_ptr->job_id); + job_config_fini(job_ptr); + if (job_ptr->batch_flag && + (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { + launch_job(job_ptr); + } } } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 87c2c68da86..bd622c00a4d 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -2609,8 +2609,8 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, power_g_job_start(job_ptr); if (configuring || - bit_overlap(job_ptr->node_bitmap, power_node_bitmap) || !bit_super_set(job_ptr->node_bitmap, avail_node_bitmap)) { 
+ /* This handles nodes explicitly requesting node reboot */ job_ptr->job_state |= JOB_CONFIGURING; } diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c index 8f5d365978e..13494b9f625 100644 --- a/src/slurmctld/power_save.c +++ b/src/slurmctld/power_save.c @@ -296,8 +296,14 @@ extern int power_job_reboot(struct job_record *job_ptr) pid_t pid; boot_node_bitmap = node_features_reboot(job_ptr); - if (boot_node_bitmap == NULL) + if (boot_node_bitmap == NULL) { + /* Powered down nodes require reboot */ + if (bit_overlap(power_node_bitmap, job_ptr->node_bitmap)) { + job_ptr->job_state |= JOB_CONFIGURING; + job_ptr->bit_flags |= NODE_REBOOT; + } return SLURM_SUCCESS; + } i_first = bit_ffs(boot_node_bitmap); if (i_first >= 0) @@ -322,6 +328,7 @@ extern int power_job_reboot(struct job_record *job_ptr) nodes = bitmap2node_name(boot_node_bitmap); if (nodes) { + /* Reboot nodes to change KNL NUMA and/or MCDRAM mode */ job_ptr->job_state |= JOB_CONFIGURING; job_ptr->wait_all_nodes = 1; job_ptr->bit_flags |= NODE_REBOOT; diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 043b033d808..abd2e8d8701 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -2196,6 +2196,10 @@ extern bool test_job_array_finished(uint32_t array_job_id); /* Return true if ANY tasks of specific array job ID are pending */ extern bool test_job_array_pending(uint32_t array_job_id); +/* Determine if the nodes are ready to run a job + * RET true if ready */ +extern bool test_job_nodes_ready(struct job_record *job_ptr); + /* * Synchronize the batch job in the system with their files. * All pending batch jobs must have script and environment files -- GitLab