diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 4ba0c2f6f73e53046751630a58cebf4d77a1f6f4..7965eea52a6e4794fd058e2aae6317a87ba7d768 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -399,6 +399,14 @@ static void *_wdog(void *args)
 				node_not_resp(thread_ptr[i].node_name,
 					      thread_ptr[i].start_time);
 		}
+		if (agent_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) {
+			/* Requeue the request */
+			batch_job_launch_msg_t *launch_msg_ptr =
+					*agent_ptr->msg_args_pptr;
+			uint32_t job_id = launch_msg_ptr->job_id;
+			info("Non-responding node, requeue JobId=%u", job_id);
+			job_complete(job_id, 0, true, 0);
+		}
 		unlock_slurmctld(node_write_lock);
 #else
 		/* Build a list of all non-responding nodes and send
@@ -569,7 +577,7 @@ static void *_thread_per_node_rpc(void *args)
 			     job_id, slurm_strerror(rc));
 			thread_state = DSH_DONE;
 			lock_slurmctld(job_write_lock);
-			job_signal(job_id, SIGKILL, 0);
+			job_complete(job_id, 0, false, 1);
 			unlock_slurmctld(job_write_lock);
 			goto cleanup;
 		}
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index a201f458a32d341e9f3779c8ede80be5e7d7b41c..ed33424f0b943d3c74c4af8fd4ddb0beb924361d 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -111,6 +111,7 @@ static int _load_job_state(Buf buffer);
 static int _load_step_state(struct job_record *job_ptr, Buf buffer);
 static void _pack_job_details(struct job_details *detail_ptr, Buf buffer);
 static int _purge_job_record(uint32_t job_id);
+static void _purge_lost_batch_jobs(int node_inx, time_t now);
 static void _read_data_array_from_file(char *file_name, char ***data,
 				       uint16_t * size);
 static void _read_data_from_file(char *file_name, char **data);
@@ -2714,6 +2715,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 	int i, node_inx, jobs_on_node;
 	struct node_record *node_ptr;
 	struct job_record *job_ptr;
+	time_t now = time(NULL);
 
 	node_ptr = find_node_record(node_name);
 	if (node_ptr == NULL) {
@@ -2738,6 +2740,9 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 			debug3("Registered job %u.%u on node %s ",
 			       job_id_ptr[i], step_id_ptr[i],
 			       node_name);
+			if ((job_ptr->batch_flag) &&
+			    (node_inx == bit_ffs(job_ptr->node_bitmap)))
+				job_ptr->time_last_active = now;
 		} else {
 			error
 			    ("Registered job %u.u on wrong node %s ",
@@ -2770,12 +2775,15 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 	}
 
 	jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
+	if (jobs_on_node)
+		_purge_lost_batch_jobs(node_inx, now);
 	if (jobs_on_node != *job_count) {
 		/* slurmd will not know of a job unless the job has
 		 * steps active at registration time, so this is not
-		 * an error condition */
-		info("resetting job_count on node %s from %d to %d",
+		 * an error condition, slurmd is also reporting steps
+		 * rather than jobs */
+		debug3("resetting job_count on node %s from %d to %d",
 		       node_name, *job_count, jobs_on_node);
 		*job_count = jobs_on_node;
 	}
 
@@ -2783,6 +2791,29 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 	return;
 }
 
+/* Purge any batch job that should have its script running on node
+ * node_inx, but is not (i.e. its time_last_active != now) */
+static void _purge_lost_batch_jobs(int node_inx, time_t now)
+{
+	ListIterator job_record_iterator;
+	struct job_record *job_ptr;
+
+	job_record_iterator = list_iterator_create(job_list);
+	while ((job_ptr =
+		(struct job_record *) list_next(job_record_iterator))) {
+		if ((job_ptr->job_state != JOB_RUNNING) ||
+		    (job_ptr->batch_flag == 0) ||
+		    (job_ptr->time_last_active == now) ||
+		    (node_inx != bit_ffs(job_ptr->node_bitmap)))
+			continue;
+
+		info("Master node lost JobId=%u, killing it",
+		     job_ptr->job_id);
+		job_complete(job_ptr->job_id, 0, false, 0);
+	}
+	list_iterator_destroy(job_record_iterator);
+}
+
 /* _kill_job_on_node - Kill the specific job_id on a specific node */
 static void
 _kill_job_on_node(uint32_t job_id, struct node_record *node_ptr)
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 267ace7319c2bc6f54a225f4103fc5ad94b81f19..ebd0e536838af46c801798a2ab5e6b9963f339d2 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -76,9 +76,9 @@ static int _build_job_queue(struct job_queue **job_queue)
 
 	while ((job_record_point =
 		(struct job_record *) list_next(job_record_iterator))) {
-		if (job_record_point->job_state != JOB_PENDING)
-			continue;
-		if (job_record_point->priority == 0)	/* held */
+		if ((job_record_point->job_state != JOB_PENDING) ||
+		    (job_record_point->job_state & JOB_COMPLETING) ||
+		    (job_record_point->priority == 0))	/* held */
 			continue;
 		xassert (job_record_point->magic == JOB_MAGIC);
 		if (job_buffer_size <= job_queue_size) {
@@ -154,12 +154,13 @@ int schedule(void)
 		} else if (error_code == SLURM_SUCCESS) {
 			/* job initiated */
 			last_job_update = time(NULL);
-			info("schedule: job_id %u on nodes %s",
+			info("schedule: JobId=%u NodeList=%s",
 			     job_ptr->job_id, job_ptr->nodes);
-			_launch_job(job_ptr);
+			if (job_ptr->batch_flag)
+				_launch_job(job_ptr);
 			job_cnt++;
 		} else {
-			info("schedule: job_id %u non-runnable, error %m",
+			info("schedule: JobId=%u non-runnable: %m",
 			     job_ptr->job_id);
 			last_job_update = time(NULL);
 			job_ptr->job_state = JOB_FAILED;
@@ -219,9 +220,6 @@ static void _launch_job(struct job_record *job_ptr)
 	pthread_t thread_agent;
 	int retries = 0;
 
-	if (job_ptr->batch_flag == 0)
-		return;
-
 	node_ptr = find_first_node_record(job_ptr->node_bitmap);
 	if (node_ptr == NULL)
 		return;
@@ -245,7 +243,7 @@ static void _launch_job(struct job_record *job_ptr)
 
 	agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t));
 	agent_arg_ptr->node_count = 1;
-	agent_arg_ptr->retry = 1;
+	agent_arg_ptr->retry = 0;
 	agent_arg_ptr->slurm_addr = xmalloc(sizeof(struct sockaddr_in));
 	memcpy(agent_arg_ptr->slurm_addr, &(node_ptr->slurm_addr),
 	       sizeof(struct sockaddr_in));
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 08ee4ea0b8db38ef6b4bbd35cc974a4f8befd2bf..d6d9c23c9d849fcc87982be5f7e84218a87d65b3 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1028,14 +1028,14 @@ void build_node_details(struct job_record *job_ptr)
 			job_ptr->cpu_count_reps[cpu_inx]++;
 		} else {
-			error("Invalid node %s in job_id %u",
+			error("Invalid node %s in JobId=%u",
 			      this_node_name, job_ptr->job_id);
 		}
 		free(this_node_name);
 	}
 	hostlist_destroy(host_list);
 
 	if (job_ptr->node_cnt != node_inx) {
-		error("Node count mismatch for job_id %u (%u,%u)",
+		error("Node count mismatch for JobId=%u (%u,%u)",
 		      job_ptr->job_id, job_ptr->node_cnt, node_inx);
 		job_ptr->node_cnt = node_inx;
 	}
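
Note on the recovery logic above: the job_mgr.c hunks implement a stamp-and-sweep pattern. Each node registration stamps time_last_active on every batch job whose script should be running on that node (the first node in the job's bitmap), and _purge_lost_batch_jobs() then completes any running batch job on that node whose stamp was not refreshed in the same pass. The following is a minimal standalone sketch of that pattern, assuming simplified stand-in types; struct job_rec, head_node, last_active, report_node_registration(), and purge_lost_batch_jobs() here are hypothetical illustrations, not the slurmctld structures or APIs.

#include <stdio.h>
#include <time.h>

enum job_state { PENDING, RUNNING, COMPLETE };

/* Hypothetical stand-in for struct job_record */
struct job_rec {
	unsigned id;
	enum job_state state;
	int batch_flag;		/* 1 if the job carries a batch script */
	int head_node;		/* node index where the script runs */
	time_t last_active;	/* stamped when the head node reports it */
};

/* Stamp activity for each batch job the registering node reports,
 * mirroring the time_last_active update in validate_jobs_on_node(). */
static void report_node_registration(struct job_rec *jobs, int njobs,
				     int node, const unsigned *ids,
				     int nids, time_t now)
{
	for (int i = 0; i < njobs; i++) {
		if (!jobs[i].batch_flag || jobs[i].head_node != node)
			continue;
		for (int j = 0; j < nids; j++)
			if (ids[j] == jobs[i].id)
				jobs[i].last_active = now;
	}
}

/* Any running batch job whose head node just registered without
 * reporting the script is considered lost and gets completed. */
static void purge_lost_batch_jobs(struct job_rec *jobs, int njobs,
				  int node, time_t now)
{
	for (int i = 0; i < njobs; i++) {
		if (jobs[i].state != RUNNING || !jobs[i].batch_flag ||
		    jobs[i].head_node != node ||
		    jobs[i].last_active == now)
			continue;
		printf("Master node lost JobId=%u, killing it\n",
		       jobs[i].id);
		jobs[i].state = COMPLETE;
	}
}

int main(void)
{
	struct job_rec jobs[] = {
		{ 100, RUNNING, 1, 0, 0 },	/* reported, survives */
		{ 101, RUNNING, 1, 0, 0 },	/* unreported, purged */
	};
	unsigned reported[] = { 100 };	/* node 0 reports only job 100 */
	time_t now = time(NULL);

	report_node_registration(jobs, 2, 0, reported, 1, now);
	purge_lost_batch_jobs(jobs, 2, 0, now);
	return 0;
}

As in the patch, comparing last_active against a single timestamp taken at the start of the registration pass distinguishes "reported in this pass" from "missing in this pass" without any per-job bookkeeping between passes.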
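The _build_job_queue() change is related: job_state carries flag bits on top of the base state, so a requeued batch job can be PENDING while also flagged COMPLETING until its previous allocation drains, and such a job must not be rescheduled yet. A small sketch of that filter, with illustrative (not actual) state values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { JOB_PENDING = 0, JOB_RUNNING = 1, JOB_FAILED = 5 };
#define JOB_COMPLETING 0x8000	/* flag bit OR'd into job_state */

static bool schedulable(uint16_t job_state, uint32_t priority)
{
	/* A job enters the queue only if it is purely pending
	 * (no COMPLETING flag) and not held (priority != 0). */
	if ((job_state != JOB_PENDING) ||
	    (job_state & JOB_COMPLETING) ||
	    (priority == 0))
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", schedulable(JOB_PENDING, 100));			/* 1 */
	printf("%d\n", schedulable(JOB_PENDING | JOB_COMPLETING, 100));	/* 0 */
	printf("%d\n", schedulable(JOB_PENDING, 0));			/* 0 */
	return 0;
}

With this flag encoding the base-state comparison already rejects a pending-plus-completing job; the explicit flag test makes the intent visible and keeps the filter safe if the state comparison is ever masked to the base state.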