diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 64d20e36e46dde97623f7c02c25fe3cdb39c7670..5a7c99638605067e0e0dd30cb8a9c5fcb55463b4 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -162,6 +162,7 @@ typedef struct mail_info {
 } mail_info_t;
 
 static void _sig_handler(int dummy);
+static bool _batch_launch_defer(queued_request_t *queued_req_ptr);
 static inline int _comm_err(char *node_name);
 static void _list_delete_retry(void *retry_entry);
 static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr);
@@ -1160,6 +1161,8 @@ extern int agent_retry (int min_wait, bool mail_too)
 	retry_iter = list_iterator_create(retry_list);
 	while ((queued_req_ptr = (queued_request_t *)
 			list_next(retry_iter))) {
+		if (_batch_launch_defer(queued_req_ptr))
+			continue;
 		if (queued_req_ptr->last_attempt == 0) {
 			list_remove(retry_iter);
 			list_size--;
@@ -1178,6 +1181,8 @@ extern int agent_retry (int min_wait, bool mail_too)
 		/* next try to find an older record to retry */
 		while ((queued_req_ptr = (queued_request_t *)
 				list_next(retry_iter))) {
+			if (_batch_launch_defer(queued_req_ptr))
+				continue;
 			age = difftime(now, queued_req_ptr->last_attempt);
 			if (age > min_wait) {
 				list_remove(retry_iter);
@@ -1440,3 +1445,48 @@ extern void mail_job_info (struct job_record *job_ptr, uint16_t mail_type)
 	return;
 }
 
+/* Return true if the request is to launch a batch job and the message
+ * destination is not yet powered up, otherwise return false */
+static bool _batch_launch_defer(queued_request_t *queued_req_ptr)
+{
+	char hostname[512];
+	agent_arg_t *agent_arg_ptr;
+	batch_job_launch_msg_t *launch_msg_ptr;
+	struct node_record *node_ptr;
+	time_t now = time(NULL);
+
+	agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
+	if (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH)
+		return false;
+
+	launch_msg_ptr = (batch_job_launch_msg_t *)agent_arg_ptr->msg_args;
+	hostlist_deranged_string(agent_arg_ptr->hostlist,
+				 sizeof(hostname), hostname);
+	node_ptr = find_node_record(hostname);
+	if (node_ptr == NULL) {
+		/* NOTE: report the derived hostname string; the raw
+		 * hostlist_t handle is opaque and not printable with %s */
+		error("agent(batch_launch) could not locate node %s",
+		      hostname);
+		queued_req_ptr->last_attempt = (time_t) 0;
+		return false;	/* no benefit to defer */
+	}
+
+	if (((node_ptr->node_state & NODE_STATE_POWER_SAVE) == 0) &&
+	    ((node_ptr->node_state & NODE_STATE_NO_RESPOND) == 0)) {
+		info("agent ready to send batch request to %s", hostname);
+		queued_req_ptr->last_attempt = (time_t) 0;
+		return false;
+	}
+
+	if (queued_req_ptr->last_attempt == 0)
+		queued_req_ptr->last_attempt = now;
+	else if (difftime(now, queued_req_ptr->last_attempt) >=
+		 BATCH_START_TIME) {
+		error("agent waited too long for node %s to come up, "
+		      "sending batch request anyway...", hostname);
+		queued_req_ptr->last_attempt = (time_t) 0;
+		return false;
+	}
+
+	info("agent waiting to send batch request to %s", hostname);
+	return true;
+}
diff --git a/src/slurmctld/agent.h b/src/slurmctld/agent.h
index 34cc778282f1c61f652a6997f487db0fdf646d9d..b7b98da1e45f86187c456c15a57ecbb067acd09f 100644
--- a/src/slurmctld/agent.h
+++ b/src/slurmctld/agent.h
@@ -1,10 +1,9 @@
 /*****************************************************************************\
  *  agent.h - data structures and function definitions for parallel
  *	background communications
- *
- *  $Id$
  *****************************************************************************
- *  Copyright (C) 2002-2006 The Regents of the University of California.
+ *  Copyright (C) 2002-2007 The Regents of the University of California.
+ *  Copyright (C) 2008 Lawrence Livermore National Security.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Morris Jette <jette@llnl.gov>, et. al.
  *  Derived from dsh written by Jim Garlick <garlick1@llnl.gov>
@@ -47,6 +46,7 @@
 #define AGENT_IS_THREAD		1	/* set if agent itself a thread of
 					 * slurmctld, 0 for function call */
 #define AGENT_THREAD_COUNT	10	/* maximum active threads per agent */
+#define BATCH_START_TIME	300	/* allow batch jobs 300 secs to start */
 #define COMMAND_TIMEOUT		10	/* command requeue or error, seconds */
 #define MAX_AGENT_CNT		(MAX_SERVER_THREADS / (AGENT_THREAD_COUNT + 2))
 					/* maximum simultaneous agents, note
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 9a8027a64044aaa119a89e55370a1929d9431341..beb5e126def12136e3ec0a29a59f425e1e00a0da 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -4604,7 +4604,8 @@ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 			error("Orphan job %u.%u reported on node %s",
 			      reg_msg->job_id[i], reg_msg->step_id[i],
 			      reg_msg->node_name);
-			abort_job_on_node(reg_msg->job_id[i], job_ptr, node_ptr);
+			abort_job_on_node(reg_msg->job_id[i],
+					  job_ptr, node_ptr);
 		}
 
 		else if ((job_ptr->job_state == JOB_RUNNING) ||
@@ -4645,7 +4646,8 @@ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 			error("Registered PENDING job %u.%u on node %s ",
 			      reg_msg->job_id[i], reg_msg->step_id[i],
 			      reg_msg->node_name);
-			abort_job_on_node(reg_msg->job_id[i], job_ptr, node_ptr);
+			abort_job_on_node(reg_msg->job_id[i],
+					  job_ptr, node_ptr);
 		}
 
 		else {	/* else job is supposed to be done */
@@ -4675,7 +4677,7 @@ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 }
 
 /* Purge any batch job that should have its script running on node
- * node_inx, but is not (i.e. its time_last_active != now) */
+ * node_inx, but is not. Allow BATCH_START_TIME secs for startup. */
 static void _purge_lost_batch_jobs(int node_inx, time_t now)
 {
 	ListIterator job_iterator;
@@ -4685,9 +4687,9 @@ static void _purge_lost_batch_jobs(int node_inx, time_t now)
 	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
 		bool job_active = ((job_ptr->job_state == JOB_RUNNING) ||
 				   (job_ptr->job_state == JOB_SUSPENDED));
-		if ((!job_active)			||
-		    (job_ptr->batch_flag == 0)		||
-		    (job_ptr->time_last_active == now)	||
+		if ((!job_active)				||
+		    (job_ptr->batch_flag == 0)			||
+		    ((job_ptr->time_last_active + BATCH_START_TIME) > now) ||
 		    (node_inx != bit_ffs(job_ptr->node_bitmap)))
 			continue;