diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 5de60e60a633e5015b50ff8a72aa0913e02946e5..becd16f12270089fa6b81e9ba922933d7439f2df 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -122,7 +122,7 @@ static void _read_data_array_from_file(char *file_name, char ***data, uint16_t * size); static void _read_data_from_file(char *file_name, char **data); static void _remove_defunct_batch_dirs(List batch_dirs); -static void _reset_detail_bitmaps(struct job_record *job_ptr); +static int _reset_detail_bitmaps(struct job_record *job_ptr); static void _reset_step_bitmaps(struct job_record *job_ptr); static void _set_job_id(struct job_record *job_ptr); static void _set_job_prio(struct job_record *job_ptr); @@ -2260,19 +2260,22 @@ void reset_job_bitmaps(void) ListIterator job_record_iterator; struct job_record *job_ptr; struct part_record *part_ptr; + bool job_fail = false; if (job_list == NULL) fatal ("reset_job_bitmaps: job_list == NULL"); + job_record_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_record_iterator))) { xassert (job_ptr->magic == JOB_MAGIC); + job_fail = false; part_ptr = list_find_first(part_list, &list_find_part, - job_ptr->partition); + job_ptr->partition); if (part_ptr == NULL) { error("Invalid partition (%s) for job_id %u", job_ptr->partition, job_ptr->job_id); - job_ptr->job_state = JOB_NODE_FAIL; + job_fail = true; } job_ptr->part_ptr = part_ptr; @@ -2281,27 +2284,42 @@ void reset_job_bitmaps(void) (node_name2bitmap(job_ptr->nodes, &job_ptr->node_bitmap))) { error("Invalid nodes (%s) for job_id %u", job_ptr->nodes, job_ptr->job_id); - job_ptr->job_state = JOB_NODE_FAIL; + job_fail = true; } build_node_details(job_ptr); /* set: num_cpu_groups, * cpu_count_reps, node_cnt, * cpus_per_node, node_addr */ - _reset_detail_bitmaps(job_ptr); + if (_reset_detail_bitmaps(job_ptr)) + job_fail = true; + _reset_step_bitmaps(job_ptr); if ((job_ptr->kill_on_step_done) && (list_count(job_ptr->step_list) <= 1)) - job_ptr->job_state = JOB_NODE_FAIL; + job_fail = true; + + if (job_fail) { + if (job_ptr->job_state == JOB_PENDING) { + job_ptr->start_time = + job_ptr->end_time = time(NULL); + job_ptr->job_state = JOB_NODE_FAIL; + } else if (job_ptr->job_state == JOB_RUNNING) { + job_ptr->end_time = time(NULL); + job_ptr->job_state = JOB_NODE_FAIL | + JOB_COMPLETING; + } + delete_all_step_records(job_ptr); + } } list_iterator_destroy(job_record_iterator); last_job_update = time(NULL); } -static void _reset_detail_bitmaps(struct job_record *job_ptr) +static int _reset_detail_bitmaps(struct job_record *job_ptr) { if (job_ptr->details == NULL) - return; + return SLURM_SUCCESS; FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); if ((job_ptr->details->req_nodes) && @@ -2309,7 +2327,7 @@ static void _reset_detail_bitmaps(struct job_record *job_ptr) &job_ptr->details->req_node_bitmap))) { error("Invalid req_nodes (%s) for job_id %u", job_ptr->details->req_nodes, job_ptr->job_id); - job_ptr->job_state = JOB_NODE_FAIL; + return SLURM_ERROR; } FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); @@ -2318,8 +2336,10 @@ static void _reset_detail_bitmaps(struct job_record *job_ptr) &job_ptr->details->exc_node_bitmap))) { error("Invalid exc_nodes (%s) for job_id %u", job_ptr->details->exc_nodes, job_ptr->job_id); - job_ptr->job_state = JOB_NODE_FAIL; + return SLURM_ERROR; } + + return SLURM_SUCCESS; } static void _reset_step_bitmaps(struct job_record *job_ptr) diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 4912195e9ca81607ef495156f0f80ffe1abed562..a9d6a185de408869d3f2d3788d145b6e21d1b4ba 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -56,7 +56,7 @@ static int _parse_part_spec(char *in_line); static void _set_config_defaults(slurm_ctl_conf_t * ctl_conf_ptr); static int _sync_nodes_to_comp_job(void); static int _sync_nodes_to_jobs(void); -static int _sync_nodes_to_run_job(struct job_record *job_ptr); +static int _sync_nodes_to_active_job(struct job_record *job_ptr); #ifdef HAVE_LIBELAN3 static void _validate_node_proc_count(void); #endif @@ -848,7 +848,7 @@ static int _sync_nodes_to_jobs(void) if ((job_ptr->job_state == JOB_RUNNING) || (job_ptr->job_state & JOB_COMPLETING)) - update_cnt += _sync_nodes_to_run_job(job_ptr); + update_cnt += _sync_nodes_to_active_job(job_ptr); } if (update_cnt) info("_sync_nodes_to_jobs updated state of %d nodes", @@ -880,7 +880,7 @@ static int _sync_nodes_to_comp_job(void) return update_cnt; } -static int _sync_nodes_to_run_job(struct job_record *job_ptr) +static int _sync_nodes_to_active_job(struct job_record *job_ptr) { int i, cnt = 0; uint16_t base_state, no_resp_flag; @@ -888,20 +888,29 @@ static int _sync_nodes_to_run_job(struct job_record *job_ptr) for (i = 0; i < node_record_count; i++) { if (bit_test(job_ptr->node_bitmap, i) == 0) continue; - node_record_table_ptr[i].run_job_cnt++; base_state = node_record_table_ptr[i].node_state & (~NODE_STATE_NO_RESPOND); - if (base_state == NODE_STATE_DOWN) + if (base_state == NODE_STATE_DOWN) { job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; - if ((base_state == NODE_STATE_UNKNOWN) || - (base_state == NODE_STATE_IDLE) || - (base_state == NODE_STATE_DRAINED)) { - cnt++; + job_ptr->end_time = time(NULL); + delete_all_step_records(job_ptr); + } else { + node_record_table_ptr[i].run_job_cnt++; /* NOTE: + * This counter moved to comp_job_cnt + * by _sync_nodes_to_comp_job() */ no_resp_flag = node_record_table_ptr[i].node_state & NODE_STATE_NO_RESPOND; - node_record_table_ptr[i].node_state = + if ((base_state == NODE_STATE_UNKNOWN) || + (base_state == NODE_STATE_IDLE)) { + cnt++; + node_record_table_ptr[i].node_state = NODE_STATE_ALLOCATED | no_resp_flag; - } + } else if (base_state == NODE_STATE_DRAINED) { + cnt++; + node_record_table_ptr[i].node_state = + NODE_STATE_DRAINING | no_resp_flag; + } + } } return cnt; }