diff --git a/NEWS b/NEWS index 419b09ae5621d9aa415c87d733c8aaaad33c0e66..61e1c51417d7a69eacb33081e5a4eb81cf974efc 100644 --- a/NEWS +++ b/NEWS @@ -21,6 +21,11 @@ documents those changes that are of interest to users and admins. -- sview - There is now a .slurm/sviewrc created when runnning sview. Defaults are put in there as to how sview looks when first launched. You can set these by Ctrl-S or Options->Set Default Settings. + -- Modify srun and salloc so that after creating a resource allocation, they + wait for all allocated nodes to power up before proceeding. Salloc will + log the delay with the messages "Waiting for nodes to boot" and "Nodes are + ready for job". Srun will generate the same messages only if the --verbose + option is used. * Changes in SLURM 2.2.0.pre5 ============================= diff --git a/doc/html/power_save.shtml b/doc/html/power_save.shtml index 0aec4b931307133b1c8789f99cf3ffe6d9f74c00..69e6ff52583c707e2f5fdeb13be222e2f67d8b2a 100644 --- a/doc/html/power_save.shtml +++ b/doc/html/power_save.shtml @@ -173,7 +173,8 @@ nodes are in power save mode using messages of this sort: <p>Using these logs you can easily see the effect of SLURM's power saving support. -You can also configure SLURM with programs that perform no action as <b>SuspendProgram</b> and <b>ResumeProgram</b> to assess the potential +You can also configure SLURM with programs that perform no action as +<b>SuspendProgram</b> and <b>ResumeProgram</b> to assess the potential impact of power saving mode before enabling it.</p> <h2>Use of Allocations</h2> @@ -189,16 +190,20 @@ available).</p> <p>In the case of an <i>sbatch</i> command, the batch program will start when node zero of the allocation is ready for use and pre-processing can be performed as needed before using <i>srun</i> to launch job steps. -The operation of <i>salloc</i> and <i>srun</i> follow a similar pattern -of getting an job allocation at one time, but possibly being unable to -launch job steps until later. -If <i>ssh</i> or some other tools is used by <i>salloc</i> it may be -desirable to execute "<i>srun /bin/true</i>" or some other command -first to insure that all nodes are booted and ready for use. -We plan to add a job and node state of <i>CONFIGURING</i> in SLURM -version 2.1, which could be used to prevent salloc from executing -any processes (including <i>ssh</i>) until all of the nodes are -ready for use.</p> +Waiting for all nodes to be booted can be accomplished by adding the +command "<i>scontrol wait_job $SLURM_JOBID</i>" within the script or by +adding that command to the system <i>Prolog</i> as configured in +<i>slurm.conf</i>, which would create the delay for all jobs on the system. +Note that the <i>scontrol wait_job</i> command was added to SLURM version 2.2. +When using earlier versions of SLURM, one may execute "<i>srun /bin/true</i>" +or some other command first to ensure that all nodes are booted and ready +for use.</p> + +<p>The <i>salloc</i> and <i>srun</i> commands that create a resource +allocation automatically wait for the nodes to power up in SLURM version 2.2.
+When using earlier versions of SLURM, <i>salloc</i> will return immediately +after a resource allocation is made and one can execute "<i>srun /bin/true</i>" +to ensure that all nodes are booted and ready for use.</p> <h2>Fault Tolerance</h2> @@ -239,6 +244,6 @@ and perform the following actions: <li>Boot the appropriate image for each node</li> </ol> -<p style="text-align:center;">Last modified 6 August 2009</p> +<p style="text-align:center;">Last modified 28 April 2010</p> <!--#include virtual="footer.txt"--> diff --git a/src/api/job_info.c b/src/api/job_info.c index f42621da8beecd5c60deccb9aee5ddf57d284c11..d056d9d1f83cda4762a8ecde9f28f2eeed94f3bd 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -1083,8 +1083,8 @@ extern int slurm_job_node_ready(uint32_t job_id) } else if (resp.msg_type == RESPONSE_SLURM_RC) { int job_rc = ((return_code_msg_t *) resp.data) -> return_code; - if ((job_rc == ESLURM_INVALID_PARTITION_NAME) - || (job_rc == ESLURM_INVALID_JOB_ID)) + if ((job_rc == ESLURM_INVALID_PARTITION_NAME) || + (job_rc == ESLURM_INVALID_JOB_ID)) rc = READY_JOB_FATAL; else /* EAGAIN */ rc = READY_JOB_ERROR; diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 4fb4f71a6a55b6f811798e30712c5bfe02706dd7..7255a7e96acc4ba21454e4e7e19bf0eb905b1dc6 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -228,6 +228,23 @@ uint16_t slurm_get_batch_start_timeout(void) return batch_start_timeout; } +/* slurm_get_suspend_timeout + * RET SuspendTimeout value from slurm.conf + */ +uint16_t slurm_get_suspend_timeout(void) +{ + uint16_t suspend_timeout = 0; + slurm_ctl_conf_t *conf; + + if(slurmdbd_conf) { + } else { + conf = slurm_conf_lock(); + suspend_timeout = conf->suspend_timeout; + slurm_conf_unlock(); + } + return suspend_timeout; +} + /* slurm_get_resume_timeout * RET ResumeTimeout value from slurm.conf */ diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 3a0d22db8a4514b8643ef194d2d5bf892e7d0a57..22d4521773d4d109799c813ab0880a62de57b051 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -103,6 +103,11 @@ inline slurm_protocol_config_t *slurm_get_api_config(void); */ uint16_t slurm_get_batch_start_timeout(void); +/* slurm_get_suspend_timeout + * RET SuspendTimeout value from slurm.conf + */ +uint16_t slurm_get_suspend_timeout(void); + /* slurm_get_resume_timeout * RET ResumeTimeout value from slurm.conf */ diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 2b1953100540a5b0ce34ef7539de16233d672be0..ff12c176ab4d118dc3b9983a4f74bedbf9ae773d 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -1820,9 +1820,26 @@ extern int select_p_job_begin(struct job_record *job_ptr) return SLURM_SUCCESS; } +/* Determine if allocated nodes are usable (powered up) */ extern int select_p_job_ready(struct job_record *job_ptr) { - return SLURM_SUCCESS; + int i, i_first, i_last; + struct node_record *node_ptr; + + if ((job_ptr->node_bitmap == NULL) || + ((i_first = bit_ffs(job_ptr->node_bitmap)) == -1)) + return READY_NODE_STATE; + i_last = bit_fls(job_ptr->node_bitmap); + + for (i=i_first; i<=i_last; i++) { + if (bit_test(job_ptr->node_bitmap, i) == 0) + continue; + node_ptr = node_record_table_ptr + i; + if (IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr)) + return 0; + } + + return READY_NODE_STATE; } extern int select_p_job_resized(struct
job_record *job_ptr, diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index ed51ae460542e4bd9e966908541f4974dadf5fe0..8d30ed5738c325b9bdd54aa974b60b47786a3dad 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -2339,12 +2339,26 @@ extern int select_p_job_begin(struct job_record *job_ptr) return rc; } +/* Determine if allocated nodes are usable (powered up) */ extern int select_p_job_ready(struct job_record *job_ptr) { - if (!IS_JOB_RUNNING(job_ptr)) - return 0; + int i, i_first, i_last; + struct node_record *node_ptr; + + if ((job_ptr->node_bitmap == NULL) || + ((i_first = bit_ffs(job_ptr->node_bitmap)) == -1)) + return READY_NODE_STATE; + i_last = bit_fls(job_ptr->node_bitmap); + + for (i=i_first; i<=i_last; i++) { + if (bit_test(job_ptr->node_bitmap, i) == 0) + continue; + node_ptr = node_record_table_ptr + i; + if (IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr)) + return 0; + } - return 1; + return READY_NODE_STATE; } extern int select_p_job_resized(struct job_record *job_ptr, diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index c93fec1859f94b6cc2e39b9696a04122ca82114d..8d36fb961dc64b7bf968ec3cc55391cf1220b7f8 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -77,7 +77,8 @@ #include "src/common/node_select.h" #endif -#define MAX_RETRIES 3 +#define MAX_RETRIES 10 +#define POLL_SLEEP 3 /* retry interval in seconds */ char **command_argv; int command_argc; @@ -109,10 +110,11 @@ static void _ping_handler(srun_ping_msg_t *msg); static void _node_fail_handler(srun_node_fail_msg_t *msg); #ifdef HAVE_BG -#define POLL_SLEEP 3 /* retry interval in seconds */ static int _wait_bluegene_block_ready( resource_allocation_response_msg_t *alloc); static int _blocks_dealloc(void); +#else +static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc); #endif #ifdef HAVE_CRAY_XT @@ -267,6 +269,13 @@ int main(int argc, char *argv[]) "boot of the block."); goto relinquish; } +#else + if (!_wait_nodes_ready(alloc)) { + if(!allocation_interrupted) + error("Something is wrong with the " + "boot of the nodes."); + goto relinquish; + } #endif #ifdef HAVE_CRAY_XT if (!_claim_reservation(alloc)) { @@ -774,7 +783,7 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc) if (rc == READY_JOB_FATAL) break; /* fatal error */ - if (rc == READY_JOB_ERROR) /* error */ + if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) continue; /* retry */ if ((rc & READY_JOB_STATE) == 0) /* job killed */ break; @@ -824,7 +833,7 @@ static int _blocks_dealloc(void) } if (error_code) { - error("slurm_load_partitions: %s\n", + error("slurm_load_partitions: %s", slurm_strerror(slurm_get_errno())); return -1; } @@ -838,6 +847,60 @@ static int _blocks_dealloc(void) bg_info_ptr = new_bg_ptr; return rc; } +#else +/* returns 1 if job and nodes are ready for job to begin, 0 otherwise */ +static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc) +{ + int is_ready = 0, i, rc; + int cur_delay = 0; + int suspend_time, resume_time, max_delay; + + suspend_time = slurm_get_suspend_timeout(); + resume_time = slurm_get_resume_timeout(); + if ((suspend_time == 0) || (resume_time == 0)) + return 1; /* Power save mode disabled */ + max_delay = suspend_time + resume_time; + max_delay *= 5; /* Allow for ResumeRate support */ + + pending_job_id = alloc->job_id; + + for (i=0; (cur_delay < max_delay); i++) { + if (i) { + if (i == 1) + info("Waiting for nodes to boot"); + else + 
debug("still waiting"); + sleep(POLL_SLEEP); + cur_delay += POLL_SLEEP; + } + + rc = slurm_job_node_ready(alloc->job_id); + + if (rc == READY_JOB_FATAL) + break; /* fatal error */ + if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) + continue; /* retry */ + if ((rc & READY_JOB_STATE) == 0) /* job killed */ + break; + if (rc & READY_NODE_STATE) { /* job and node ready */ + is_ready = 1; + break; + } + if (allocation_interrupted) + break; + } + if (is_ready) { + if (i > 0) + info ("Nodes %s are ready for job", alloc->node_list); + } else if (!allocation_interrupted) + error("Nodes %s are still not ready", alloc->node_list); + else /* allocation_interrupted or slurmctld not responing */ + is_ready = 0; + + pending_job_id = 0; + + return is_ready; +} #endif /* HAVE_BG */ #ifdef HAVE_CRAY_XT diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 11003807097f03a07b324bbc81d4e04f0838202c..e2a16b392a028b230a6f2c59b56c0c7d02e6c897 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1812,18 +1812,16 @@ extern int kill_running_job_by_node_name(char *node_name) } job_ptr->restart_cnt++; /* Since the job completion logger - removes the submit we need to add it - again. - */ + * removes the submit we need to add it + * again. */ acct_policy_add_job_submit(job_ptr); } else { info("Killing job_id %u on failed node %s", job_ptr->job_id, node_name); srun_node_fail(job_ptr->job_id, node_name); job_ptr->job_state = JOB_NODE_FAIL | - JOB_COMPLETING; - job_ptr->exit_code = - MAX(job_ptr->exit_code, 1); + JOB_COMPLETING; + job_ptr->exit_code = MAX(job_ptr->exit_code, 1); job_ptr->state_reason = FAIL_DOWN_NODE; xfree(job_ptr->state_desc); if (suspended) { @@ -1831,8 +1829,7 @@ extern int kill_running_job_by_node_name(char *node_name) job_ptr->suspend_time; job_ptr->tot_sus_time += difftime(now, - job_ptr-> - suspend_time); + job_ptr->suspend_time); } else job_ptr->end_time = time(NULL); deallocate_nodes(job_ptr, false, suspended); diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 3d5fed6f2af631d12ee59c460fb43821bb959e78..e7cb36c190915552b0f8718bc7212f57db41c493 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -73,9 +73,10 @@ #endif -#define MAX_ALLOC_WAIT 60 /* seconds */ -#define MIN_ALLOC_WAIT 5 /* seconds */ -#define MAX_RETRIES 10 +#define MAX_ALLOC_WAIT 60 /* seconds */ +#define MIN_ALLOC_WAIT 5 /* seconds */ +#define MAX_RETRIES 10 +#define POLL_SLEEP 3 /* retry interval in seconds */ pthread_mutex_t msg_lock = PTHREAD_MUTEX_INITIALIZER; pthread_cond_t msg_cond = PTHREAD_COND_INITIALIZER; @@ -97,10 +98,11 @@ static void _signal_while_allocating(int signo); static void _intr_handler(int signo); #ifdef HAVE_BG -#define POLL_SLEEP 3 /* retry interval in seconds */ static int _wait_bluegene_block_ready( resource_allocation_response_msg_t *alloc); static int _blocks_dealloc(void); +#else +static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc); #endif #ifdef HAVE_CRAY_XT @@ -252,7 +254,7 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc) if (rc == READY_JOB_FATAL) break; /* fatal error */ - if (rc == READY_JOB_ERROR) /* error */ + if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) continue; /* retry */ if ((rc & READY_JOB_STATE) == 0) /* job killed */ break; @@ -316,6 +318,60 @@ static int _blocks_dealloc(void) bg_info_ptr = new_bg_ptr; return rc; } +#else +/* returns 1 if job and nodes are ready for job to begin, 0 otherwise */ +static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc) +{ + int 
is_ready = 0, i, rc; + int cur_delay = 0; + int suspend_time, resume_time, max_delay; + + suspend_time = slurm_get_suspend_timeout(); + resume_time = slurm_get_resume_timeout(); + if ((suspend_time == 0) || (resume_time == 0)) + return 1; /* Power save mode disabled */ + max_delay = suspend_time + resume_time; + max_delay *= 5; /* Allow for ResumeRate support */ + + pending_job_id = alloc->job_id; + + for (i=0; (cur_delay < max_delay); i++) { + if (i) { + if (i == 1) + verbose("Waiting for nodes to boot"); + else + debug("still waiting"); + sleep(POLL_SLEEP); + cur_delay += POLL_SLEEP; + } + + rc = slurm_job_node_ready(alloc->job_id); + + if (rc == READY_JOB_FATAL) + break; /* fatal error */ + if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) + continue; /* retry */ + if ((rc & READY_JOB_STATE) == 0) /* job killed */ + break; + if (rc & READY_NODE_STATE) { /* job and node ready */ + is_ready = 1; + break; + } + if (destroy_job) + break; + } + if (is_ready) { + if (i > 0) + verbose("Nodes %s are ready for job", alloc->node_list); + } else if (!destroy_job) + error("Nodes %s are still not ready", alloc->node_list); + else /* destroy_job or slurmctld not responding */ + is_ready = 0; + + pending_job_id = 0; + + return is_ready; +} #endif /* HAVE_BG */ #ifdef HAVE_CRAY_XT @@ -408,6 +464,13 @@ allocate_nodes(void) "boot of the block."); goto relinquish; } +#else + if (!_wait_nodes_ready(resp)) { + if(!destroy_job) + error("Something is wrong with the " + "boot of the nodes."); + goto relinquish; + } #endif #ifdef HAVE_CRAY_XT if (!_claim_reservation(resp)) {
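
For reference, the sketch below shows the polling pattern that the new _wait_nodes_ready() helpers implement, but as a standalone client built against libslurm: it repeatedly calls slurm_job_node_ready() until the job's allocated nodes report READY_NODE_STATE. This is illustrative only and is not part of the patch; the file name, the 3-second poll interval, and the 300-second cap are assumptions made for the example, whereas the patch derives its limit from the SuspendTimeout and ResumeTimeout values in slurm.conf via slurm_get_suspend_timeout() and slurm_get_resume_timeout().

/* wait_nodes.c -- hypothetical example, not part of this patch.
 * Assumes libslurm and its headers are installed; build with something
 * like: gcc -o wait_nodes wait_nodes.c -lslurm */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <slurm/slurm.h>

int main(int argc, char **argv)
{
	uint32_t job_id;
	int rc, cur_delay = 0;
	const int poll_sleep = 3;	/* seconds between polls (assumed) */
	const int max_delay = 300;	/* give up after 5 minutes (assumed) */

	if (argc != 2) {
		fprintf(stderr, "Usage: %s <job_id>\n", argv[0]);
		return 1;
	}
	job_id = (uint32_t) strtoul(argv[1], NULL, 10);

	while (cur_delay < max_delay) {
		rc = slurm_job_node_ready(job_id);
		if (rc == READY_JOB_FATAL)		/* invalid job id, etc. */
			break;
		if (rc != READY_JOB_ERROR) {		/* transient error: just retry */
			if ((rc & READY_JOB_STATE) == 0)	/* job no longer exists */
				break;
			if (rc & READY_NODE_STATE) {	/* all nodes powered up */
				printf("Nodes for job %u are ready\n",
				       (unsigned int) job_id);
				return 0;
			}
		}
		sleep(poll_sleep);
		cur_delay += poll_sleep;
	}
	fprintf(stderr, "Nodes for job %u are still not ready\n",
		(unsigned int) job_id);
	return 1;
}

On SLURM 2.2 and later this wait is performed automatically by salloc and srun after the allocation is granted, and "scontrol wait_job <jobid>" exposes the same check to batch scripts and prolog programs, so a separate tool like the one above is only needed with older versions or custom workflows.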