diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index a475c1d0c5285a88737446620ff42f007f7325c5..c7d92a44e74f3277cc90ad6ceeb74148bed77a76 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -127,8 +127,6 @@ static void _delete_job_desc_files(uint32_t job_id); static void _dump_job_details(struct job_details *detail_ptr, Buf buffer); static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer); -static void _excise_node_from_job(struct job_record *job_ptr, - struct node_record *node_ptr); static int _find_batch_dir(void *x, void *key); static void _get_batch_job_dir_ids(List batch_dirs); static void _job_timed_out(struct job_record *job_ptr); @@ -1373,7 +1371,7 @@ extern int kill_running_job_by_node_name(char *node_name) error("Removing failed node %s from job_id %u", node_name, job_ptr->job_id); kill_step_on_node(job_ptr, node_ptr); - _excise_node_from_job(job_ptr, node_ptr); + excise_node_from_job(job_ptr, node_ptr); } else if (job_ptr->batch_flag && job_ptr->details && (job_ptr->details->requeue > 0)) { char requeue_msg[128]; @@ -1458,8 +1456,8 @@ extern int kill_running_job_by_node_name(char *node_name) } /* Remove one node from a job's allocation */ -static void _excise_node_from_job(struct job_record *job_ptr, - struct node_record *node_ptr) +extern void excise_node_from_job(struct job_record *job_ptr, + struct node_record *node_ptr) { int i, orig_pos = -1, new_pos = -1; bitstr_t *orig_bitmap = bit_copy(job_ptr->node_bitmap); diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index dedf631df872af8b5d97297f791523576e8f79db..e6f62ca57a0868f3874d65a01712a586736c1207 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -81,6 +81,7 @@ #include "src/slurmctld/reservation.h" #include "src/slurmctld/sched_plugin.h" #include "src/slurmctld/slurmctld.h" +#include "src/slurmctld/srun_comm.h" #include "src/slurmctld/trigger_mgr.h" #include "src/slurmctld/topo_plugin.h" @@ -1183,11 
+1184,10 @@ static int _sync_nodes_to_active_job(struct job_record *job_ptr) uint16_t base_state, node_flags; struct node_record *node_ptr = node_record_table_ptr; - job_ptr->node_cnt = 0; + job_ptr->node_cnt = bit_set_count(job_ptr->node_bitmap); for (i = 0; i < node_record_count; i++, node_ptr++) { if (bit_test(job_ptr->node_bitmap, i) == 0) continue; - job_ptr->node_cnt++; base_state = node_ptr->node_state & NODE_STATE_BASE; node_flags = node_ptr->node_state & NODE_STATE_FLAGS; @@ -1200,8 +1200,21 @@ static int _sync_nodes_to_active_job(struct job_record *job_ptr) (job_ptr->details) && (job_ptr->details->shared == 0)) node_ptr->no_share_job_cnt++; - if (base_state == NODE_STATE_DOWN) { + if ((base_state == NODE_STATE_DOWN) && + (job_ptr->job_state == JOB_RUNNING) && + (job_ptr->kill_on_node_fail == 0) && + (job_ptr->node_cnt > 1)) { + /* This should only happen if a job was running + * on a node that was newly configured DOWN */ + info("Removing failed node %s from job_id %u", + node_ptr->name, job_ptr->job_id); + srun_node_fail(job_ptr->job_id, node_ptr->name); + kill_step_on_node(job_ptr, node_ptr); + excise_node_from_job(job_ptr, node_ptr); + } else if (base_state == NODE_STATE_DOWN) { time_t now = time(NULL); + info("Killing job %u on DOWN node %s", + job_ptr->job_id, node_ptr->name); job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; job_ptr->end_time = MIN(job_ptr->end_time, now); job_ptr->exit_code = MAX(job_ptr->exit_code, 1); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 8a144f137564a16757b71f60221246318d0f26c5..ef96cef664d6b5983a0d069a9cf5b7c89fb28555 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -776,6 +776,10 @@ extern void dump_job_step_state(struct step_record *step_ptr, Buf buffer); */ extern void dump_step_desc(job_step_create_request_msg_t *step_spec); +/* Remove one node from a job's allocation */ +extern void excise_node_from_job(struct job_record *job_ptr, + struct node_record 
*node_ptr); + /* * find_job_record - return a pointer to the job record with the given job_id * IN job_id - requested job's id @@ -803,6 +807,16 @@ extern struct node_record *find_node_record (char *name); */ extern struct part_record *find_part_record (char *name); +/* + * find_step_record - return a pointer to the step record with the given + * job_id and step_id + * IN job_ptr - pointer to job table entry to search for the step record + * IN step_id - id of the desired job step + * RET pointer to the job step's record, NULL on error + */ +extern struct step_record * find_step_record(struct job_record *job_ptr, + uint32_t step_id); + /* * get_job_env - return the environment variables and their count for a * given job @@ -825,16 +839,6 @@ extern char *get_job_script (struct job_record *job_ptr); */ extern uint32_t get_next_job_id(void); -/* - * find_step_record - return a pointer to the step record with the given - * job_id and step_id - * IN job_ptr - pointer to job table entry to have step record added - * IN step_id - id of the desired job step - * RET pointer to the job step's record, NULL on error - */ -extern struct step_record * find_step_record(struct job_record *job_ptr, - uint32_t step_id); - /* * init_job_conf - initialize the job configuration tables and values. * this should be called after creating node information, but