From edb6e6de044624d5127f68af191e07712ffa0957 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 27 Nov 2006 22:27:28 +0000
Subject: [PATCH] Modify job state "reason" field to report why a job failed
 (previously reported only the reason a job was waiting to run). Requires
 cold-start of slurmctld (-c option).

---
 NEWS                                |  3 +++
 doc/man/man1/squeue.1               | 32 +++++++++++++++++++---
 slurm/slurm.h.in                    | 24 ++++++++++++-----
 src/api/job_info.c                  |  3 ++-
 src/common/slurm_protocol_defs.c    | 18 ++++++++++++-
 src/common/slurm_protocol_defs.h    |  2 +-
 src/common/slurm_protocol_pack.c    | 11 ++++----
 src/plugins/sched/wiki2/start_job.c |  9 +++----
 src/slurmctld/job_mgr.c             | 41 ++++++++++++++++++++---------
 src/slurmctld/job_scheduler.c       |  1 +
 src/slurmctld/node_mgr.c            |  1 +
 src/slurmctld/node_scheduler.c      | 12 +++------
 src/slurmctld/read_config.c         |  1 +
 src/slurmctld/slurmctld.h           |  4 +--
 src/squeue/print.c                  |  8 +++---
 src/sview/job_info.c                |  4 +--
 16 files changed, 124 insertions(+), 50 deletions(-)

diff --git a/NEWS b/NEWS
index b3f0c6bb4f1..f00d2cb339b 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,9 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 1.2.0-pre8
 =============================
+ -- Modify job state "reason" field to report why a job failed (previously
+    reported only the reason a job was waiting to run). Requires cold-start
+    of slurmctld (-c option).
 
 * Changes in SLURM 1.2.0-pre7
 =============================
diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1
index ed6d7126a82..6b6eb55faf3 100644
--- a/doc/man/man1/squeue.1
+++ b/doc/man/man1/squeue.1
@@ -1,4 +1,4 @@
-.TH SQUEUE "1" "March 2006" "squeue 1.1" "Slurm components"
+.TH SQUEUE "1" "November 2006" "squeue 1.2" "Slurm components"
 
 .SH "NAME"
 squeue \- view information about jobs located in the SLURM scheduling queue.
@@ -174,13 +174,15 @@ Priority of the job (converted to a floating point number between 0.0 and 1.0
 Partition of the job or job step
 .TP
 \fB%r\fR
-The reason a job is waiting for execution.
+The reason a job is in its current state.
 See the \fBJOB REASON CODES\fR section below for more information.
 .TP
 \fB%R\fR
-For running or completed jobs: the list of allocate nodes.
 For pending jobs: the reason a job is waiting for execution
-is printed within parenthesis.
+is printed within parentheses.
+For terminated jobs with failure: an explanation as to why the
+job failed is printed within parentheses.
+For all other job states: the list of allocated nodes.
 See the \fBJOB REASON CODES\fR section below for more information.
 .TP
 \fB%s\fR
@@ -308,6 +310,28 @@ One or more higher priority jobs exist for this partition.
 .TP
 \fBResources\fR
 The job is waiting for resources to become available.
+.TP
+\fBNodeDown\fR
+A node required by the job is down.
+.TP
+\fBBadConstraints\fR
+The job's constraints can not be satisfied.
+.TP
+\fBSystemFailure\fR
+Failure of the SLURM system, a file system, the network, etc.
+.TP
+\fBJobLaunchFailure\fR
+The job could not be launched.
+This may be due to a file system problem, invalid program name, etc.
+.TP
+\fBNonZeroExitCode\fR
+The job terminated with a non\-zero exit code.
+.TP
+\fBTimeLimit\fR
+The job exhausted its time limit.
+.TP
+\fBInactiveLimit\fR
+The job reached the system InactiveLimit.
.SH "JOB STATE CODES" Jobs typically pass through several states in the course of their diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index d792acc9ee9..9fa578bb8dd 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -172,9 +172,11 @@ enum job_states { #define NICE_OFFSET 10000 /* offset for job's nice value */ -/* Reason for job to be pending rather than executing. If multiple reasons - * exists, only one is given for the sake of system efficiency */ -enum job_wait_reason { +/* Reason for job to be pending rather than executing or reason for job + * failure. If multiple reasons exists, only one is given for the sake of + * system efficiency */ +enum job_state_reason { +/* Reasons for job to be pending */ WAIT_NO_REASON = 0, /* not set or job not pending */ WAIT_PRIORITY, /* higher priority jobs exist */ WAIT_DEPENDENCY, /* depedent job has not completed */ @@ -183,7 +185,17 @@ enum job_wait_reason { WAIT_PART_TIME_LIMIT, /* request exceeds partition time limit */ WAIT_PART_STATE, /* requested partition is down */ WAIT_HELD, /* job is held, priority==0 */ - WAIT_TIME /* job waiting for specific begin time */ + WAIT_TIME, /* job waiting for specific begin time */ + WAIT_TBD1, + WAIT_TBD2, + FAIL_DOWN_PARTITION, /* partition for job is DOWN */ + FAIL_DOWN_NODE, /* some node in the allocation failed */ + FAIL_BAD_CONSTRAINTS, /* constraints can not be satisfied */ + FAIL_SYSTEM, /* slurm system failure */ + FAIL_LAUNCH, /* unable to launch job */ + FAIL_EXIT_CODE, /* exit code was non-zero */ + FAIL_TIMEOUT, /* reached end of time limit */ + FAIL_INACTIVE_LIMIT /* reached slurm InactiveLimit */ }; enum job_acct_types { @@ -569,8 +581,8 @@ typedef struct job_info { uint32_t dependency; /* defer until specified job completes */ uint32_t exit_code; /* exit code for job (status from wait call) */ char *account; /* charge to specified account */ - uint16_t wait_reason; /* reason job still pending, see - * slurm.h:enum job_wait_reason */ + uint16_t state_reason; /* reason job still pending or failed, see + * slurm.h:enum job_state_reason */ char *network; /* network specification */ char *comment; /* arbitrary comment (used by Moab scheduler) */ select_jobinfo_t select_jobinfo; /* opaque data type, diff --git a/src/api/job_info.c b/src/api/job_info.c index e2074b87ab4..77ff39e890c 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -376,7 +376,8 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) snprintf(tmp_line, sizeof(tmp_line), "Dependency=%u Account=%s Reason=%s Network=%s", job_ptr->dependency, job_ptr->account, - job_reason_string(job_ptr->wait_reason), job_ptr->network); + job_reason_string(job_ptr->state_reason), + job_ptr->network); xstrcat(out, tmp_line); if (one_liner) xstrcat(out, " "); diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 360df64d6a7..3d741438b9a 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -526,7 +526,7 @@ void inline slurm_free_suspend_msg(suspend_msg_t *msg) } /* Given a job's reason for waiting, return a descriptive string */ -extern char *job_reason_string(enum job_wait_reason inx) +extern char *job_reason_string(enum job_state_reason inx) { switch (inx) { case WAIT_NO_REASON: @@ -547,6 +547,22 @@ extern char *job_reason_string(enum job_wait_reason inx) return "JobHeld"; case WAIT_TIME: return "BeginTime"; + case FAIL_DOWN_PARTITION: + return "PartitionDown"; + case FAIL_DOWN_NODE: + return "NodeDown"; + case FAIL_BAD_CONSTRAINTS: + return "BadConstraints"; + 
+	case FAIL_SYSTEM:
+		return "SystemFailure";
+	case FAIL_LAUNCH:
+		return "JobLaunchFailure";
+	case FAIL_EXIT_CODE:
+		return "NonZeroExitCode";
+	case FAIL_TIMEOUT:
+		return "TimeLimit";
+	case FAIL_INACTIVE_LIMIT:
+		return "InactiveLimit";
 	default:
 		return "?";
 	}
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index b39273b441b..e5a74ab97f0 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -741,7 +741,7 @@ void inline slurm_free_node_select_msg(
 extern int slurm_free_msg_data(slurm_msg_type_t type, void *data);
 extern uint32_t slurm_get_return_code(slurm_msg_type_t type, void *data);
 
-extern char *job_reason_string(enum job_wait_reason inx);
+extern char *job_reason_string(enum job_state_reason inx);
 extern char *job_state_string(enum job_states inx);
 extern char *job_state_string_compact(enum job_states inx);
 extern char *node_state_string(enum node_states inx);
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index ea24da432f5..d7ad96ccb10 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -1938,10 +1938,12 @@ _unpack_job_info_members(job_info_t * job, Buf buffer)
 	safe_unpack32(&job->user_id, buffer);
 	safe_unpack32(&job->group_id, buffer);
 
-	safe_unpack16(&job->job_state, buffer);
-	safe_unpack16(&job->batch_flag, buffer);
-	safe_unpack32(&job->alloc_sid, buffer);
-	safe_unpack32(&job->time_limit, buffer);
+	safe_unpack16(&job->job_state, buffer);
+	safe_unpack16(&job->batch_flag, buffer);
+	safe_unpack16(&job->state_reason, buffer);
+
+	safe_unpack32(&job->alloc_sid, buffer);
+	safe_unpack32(&job->time_limit, buffer);
 
 	safe_unpack_time(&job->submit_time, buffer);
 	safe_unpack_time(&job->start_time, buffer);
@@ -1992,7 +1994,6 @@ _unpack_job_info_members(job_info_t * job, Buf buffer)
 	safe_unpack32(&job->job_min_memory, buffer);
 	safe_unpack32(&job->job_max_memory, buffer);
 	safe_unpack32(&job->job_min_tmp_disk, buffer);
-	safe_unpack16(&job->wait_reason, buffer);
 
 	safe_unpackstr_xmalloc(&job->req_nodes, &uint16_tmp, buffer);
 	safe_unpackstr_xmalloc(&node_inx_str, &uint16_tmp, buffer);
diff --git a/src/plugins/sched/wiki2/start_job.c b/src/plugins/sched/wiki2/start_job.c
index 6a0d55f4b3a..00cc6f2018b 100644
--- a/src/plugins/sched/wiki2/start_job.c
+++ b/src/plugins/sched/wiki2/start_job.c
@@ -182,12 +182,11 @@ static int _start_job(uint32_t jobid, char *hostlist,
 		job_ptr->priority = 0;
 		if (job_ptr->job_state == JOB_FAILED)
 			wait_string = "Invalid request, job aborted";
-		else if (job_ptr->details) {
-			wait_reason = job_ptr->details->wait_reason;
+		else {
+			wait_reason = job_ptr->state_reason;
 			wait_string = job_reason_string(wait_reason);
-			job_ptr->details->wait_reason = WAIT_HELD;
-		} else
-			wait_string = "Unknown";
+			job_ptr->state_reason = WAIT_HELD;
+		}
 		*err_code = -910 - wait_reason;
 		snprintf(tmp_msg, sizeof(tmp_msg),
 			"Could not start job %u: %s",
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index e24580de5f6..a0f5e6dccea 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -496,6 +496,7 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer)
 	pack16(dump_job_ptr->alloc_resp_port, buffer);
 	pack16(dump_job_ptr->other_port, buffer);
 	pack16(dump_job_ptr->mail_type, buffer);
+	pack16(dump_job_ptr->state_reason, buffer);
 
 	packstr(dump_job_ptr->alloc_resp_host, buffer);
 	packstr(dump_job_ptr->other_host, buffer);
@@ -547,7 +548,7 @@ static int _load_job_state(Buf buffer)
 	time_t start_time, end_time, suspend_time, pre_sus_time;
 	uint16_t job_state, next_step_id, details, batch_flag, step_flag;
 	uint16_t kill_on_node_fail, kill_on_step_done, name_len;
-	uint16_t alloc_resp_port, other_port, mail_type;
+	uint16_t alloc_resp_port, other_port, mail_type, state_reason;
 	char *nodes = NULL, *partition = NULL, *name = NULL;
 	char *alloc_node = NULL, *alloc_resp_host = NULL, *other_host = NULL;
 	char *account = NULL, *network = NULL, *mail_user = NULL;
@@ -580,6 +581,7 @@ static int _load_job_state(Buf buffer)
 	safe_unpack16(&alloc_resp_port, buffer);
 	safe_unpack16(&other_port, buffer);
 	safe_unpack16(&mail_type, buffer);
+	safe_unpack16(&state_reason, buffer);
 
 	safe_unpackstr_xmalloc(&alloc_resp_host, &name_len, buffer);
 	safe_unpackstr_xmalloc(&other_host, &name_len, buffer);
@@ -647,6 +649,7 @@ static int _load_job_state(Buf buffer)
 	    (_load_job_details(job_ptr, buffer))) {
 		job_ptr->job_state = JOB_FAILED;
 		job_ptr->exit_code = 1;
+		job_ptr->state_reason = FAIL_SYSTEM;
 		job_ptr->end_time = time(NULL);
 		goto unpack_error;
 	}
@@ -664,6 +667,7 @@ static int _load_job_state(Buf buffer)
 	job_ptr->next_step_id = next_step_id;
 	job_ptr->dependency = dependency;
 	job_ptr->exit_code = exit_code;
+	job_ptr->state_reason = state_reason;
 	job_ptr->num_procs = num_procs;
 	job_ptr->time_last_active = time(NULL);
 	strncpy(job_ptr->name, name, MAX_JOBNAME_LEN);
@@ -961,6 +965,7 @@ extern int kill_job_by_part_name(char *part_name)
 			job_ptr->job_id, part_name);
 		job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
 		job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
+		job_ptr->state_reason = FAIL_DOWN_PARTITION;
 		if (suspended)
 			job_ptr->end_time = job_ptr->suspend_time;
 		else
@@ -1049,6 +1054,7 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test)
 					JOB_COMPLETING;
 				job_ptr->exit_code =
 					MAX(job_ptr->exit_code, 1);
+				job_ptr->state_reason = FAIL_DOWN_NODE;
 				if (suspended)
 					job_ptr->end_time =
 						job_ptr->suspend_time;
@@ -1327,6 +1333,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 		if (immediate && job_ptr) {
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 			job_ptr->start_time = job_ptr->end_time = time(NULL);
 			job_completion_logger(job_ptr);
 		}
@@ -1358,6 +1365,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 	if (immediate && (too_fragmented || (!top_prio) || (!independent))) {
 		job_ptr->job_state = JOB_FAILED;
 		job_ptr->exit_code = 1;
+		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 		job_ptr->start_time = job_ptr->end_time = time(NULL);
 		job_completion_logger(job_ptr);
 		if (!independent)
@@ -1383,6 +1391,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 		if (immediate) {
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 			job_ptr->start_time = job_ptr->end_time = time(NULL);
 			job_completion_logger(job_ptr);
 		} else	/* job remains queued */
@@ -1395,6 +1404,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 	if (error_code) {	/* fundamental flaw in job request */
 		job_ptr->job_state = JOB_FAILED;
 		job_ptr->exit_code = 1;
+		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 		job_ptr->start_time = job_ptr->end_time = time(NULL);
 		job_completion_logger(job_ptr);
 		return error_code;
@@ -1439,6 +1449,7 @@ extern int job_fail(uint32_t job_id)
 	last_job_update = now;
 	job_ptr->job_state = JOB_FAILED | JOB_COMPLETING;
 	job_ptr->exit_code = 1;
+	job_ptr->state_reason = FAIL_LAUNCH;
 	deallocate_nodes(job_ptr, false, suspended);
 	job_completion_logger(job_ptr);
 	return SLURM_SUCCESS;
@@ -1642,11 +1653,13 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue,
 	} else if (WEXITSTATUS(job_return_code)) {
 		job_ptr->job_state = JOB_FAILED | job_comp_flag;
 		job_ptr->exit_code = job_return_code;
+		job_ptr->state_reason = FAIL_EXIT_CODE;
 	} else if (job_comp_flag &&	/* job was running */
 		   (job_ptr->end_time < now)) {	/* over time limit */
 		job_ptr->job_state = JOB_TIMEOUT | job_comp_flag;
 		job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
+		job_ptr->state_reason = FAIL_TIMEOUT;
 	} else
 		job_ptr->job_state = JOB_COMPLETE | job_comp_flag;
 	if (suspended)
@@ -1687,7 +1700,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 {
 	int error_code = SLURM_SUCCESS, i;
 	struct job_details *detail_ptr;
-	enum job_wait_reason fail_reason;
+	enum job_state_reason fail_reason;
 	struct part_record *part_ptr;
 	bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL;
 	bool super_user = false;
@@ -1893,6 +1906,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 			job_ptr->job_id))) {
 		job_ptr->job_state = JOB_FAILED;
 		job_ptr->exit_code = 1;
+		job_ptr->state_reason = FAIL_SYSTEM;
 		job_ptr->start_time = job_ptr->end_time = time(NULL);
 		error_code = ESLURM_WRITING_TO_FILE;
 		goto cleanup;
@@ -1930,8 +1944,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 	if (fail_reason != WAIT_NO_REASON) {
 		error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
 		job_ptr->priority = 1;	/* Move to end of queue */
-		if (detail_ptr)
-			detail_ptr->wait_reason = fail_reason;
+		job_ptr->state_reason = fail_reason;
 	}
 
 	jobacct_g_job_start_slurmctld(job_ptr);
@@ -2514,6 +2527,7 @@ void job_time_limit(void)
 			info("Inactivity time limit reached for JobId=%u",
 			     job_ptr->job_id);
 			_job_timed_out(job_ptr);
+			job_ptr->state_reason = FAIL_INACTIVE_LIMIT;
 			continue;
 		}
 		if ((job_ptr->time_limit != INFINITE)
@@ -2522,6 +2536,7 @@ void job_time_limit(void)
 			info("Time limit exhausted for JobId=%u",
 			     job_ptr->job_id);
 			_job_timed_out(job_ptr);
+			job_ptr->state_reason = FAIL_TIMEOUT;
 			continue;
 		}
 
@@ -2839,6 +2854,8 @@ void pack_job(struct job_record *dump_job_ptr, Buf buffer)
 
 	pack16(dump_job_ptr->job_state, buffer);
 	pack16(dump_job_ptr->batch_flag, buffer);
+	pack16(dump_job_ptr->state_reason, buffer);
+
 	pack32(dump_job_ptr->alloc_sid, buffer);
 	if ((dump_job_ptr->time_limit == NO_VAL) && dump_job_ptr->part_ptr)
 		pack32(dump_job_ptr->part_ptr->max_time, buffer);
@@ -2923,12 +2940,11 @@ static void _pack_pending_job_details(struct job_details *detail_ptr,
 		pack16(detail_ptr->shared, buffer);
 		pack16(detail_ptr->contiguous, buffer);
 		pack16(detail_ptr->cpus_per_task, buffer);
-		pack16(detail_ptr->job_min_procs, buffer);
+
 		pack32(detail_ptr->job_min_memory, buffer);
 		pack32(detail_ptr->job_max_memory, buffer);
 		pack32(detail_ptr->job_min_tmp_disk, buffer);
-		pack16(detail_ptr->wait_reason, buffer);
 
 		packstr(detail_ptr->req_nodes, buffer);
 		pack_bit_fmt(detail_ptr->req_node_bitmap, buffer);
@@ -2947,7 +2963,6 @@ static void _pack_pending_job_details(struct job_details *detail_ptr,
 		pack32((uint32_t) 0, buffer);
 		pack32((uint32_t) 0, buffer);
 		pack32((uint32_t) 0, buffer);
-		pack16((uint16_t) 0, buffer);
 
 		packnull(buffer);
 		packnull(buffer);
@@ -3062,6 +3077,7 @@ void reset_job_bitmaps(void)
 					JOB_COMPLETING;
 			}
 			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
+			job_ptr->state_reason = FAIL_DOWN_NODE;
 			job_completion_logger(job_ptr);
 		}
 	}
@@ -3252,9 +3268,9 @@ static bool _top_priority(struct job_record *job_ptr)
 
 	if ((!top) && detail_ptr) {	/* not top prio */
 		if (job_ptr->priority == 0)	/* user/admin hold */
-			detail_ptr->wait_reason = WAIT_HELD;
+			job_ptr->state_reason = WAIT_HELD;
 		else if (job_ptr->priority != 1)	/* not system hold */
-			detail_ptr->wait_reason = WAIT_PRIORITY;
+			job_ptr->state_reason = WAIT_PRIORITY;
 	}
 	return top;
 #endif
@@ -3845,6 +3861,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 			     job_id_ptr[i], step_id_ptr[i], node_name);
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_SYSTEM;
 			last_job_update = now;
 			job_ptr->start_time = job_ptr->end_time = now;
 			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
@@ -4048,6 +4065,7 @@ static void _validate_job_files(List batch_dirs)
 			      job_ptr->job_id);
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_SYSTEM;
 			job_ptr->start_time = job_ptr->end_time = time(NULL);
 			job_completion_logger(job_ptr);
 		}
@@ -4244,7 +4262,7 @@ extern bool job_independent(struct job_record *job_ptr)
 	struct job_details *detail_ptr = job_ptr->details;
 
 	if (detail_ptr && (detail_ptr->begin_time > time(NULL))) {
-		detail_ptr->wait_reason = WAIT_TIME;
+		job_ptr->state_reason = WAIT_TIME;
 		return false;	/* not yet time */
 	}
 
@@ -4259,8 +4277,7 @@ extern bool job_independent(struct job_record *job_ptr)
 	    (dep_ptr->job_state >= JOB_COMPLETE))
 		return true;
 
-	if (detail_ptr)
-		detail_ptr->wait_reason = WAIT_DEPENDENCY;
+	job_ptr->state_reason = WAIT_DEPENDENCY;
 	return false;	/* job exists and incomplete */
 }
 /*
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 6d2ba47bbc2..9fcb1b489b4 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -259,6 +259,7 @@ int schedule(void)
 			last_job_update = time(NULL);
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 			job_ptr->start_time = job_ptr->end_time = time(NULL);
 			job_completion_logger(job_ptr);
 			delete_job_details(job_ptr);
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index e54fa2734f8..54a0d0b9318 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1304,6 +1304,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 			/* FIXME: Could possibly recover the job */
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_SYSTEM;
 			last_job_update = now;
 			job_ptr->start_time = job_ptr->end_time = now;
 			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 8f61b1f2bd3..f6297d8bcbe 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -956,12 +956,11 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 {
 	int error_code = SLURM_SUCCESS, i, node_set_size = 0;
 	bitstr_t *select_bitmap = NULL;
-	struct job_details *detail_ptr = job_ptr->details;
 	struct node_set *node_set_ptr = NULL;
 	struct part_record *part_ptr = job_ptr->part_ptr;
 	uint32_t min_nodes, max_nodes, req_nodes;
 	int super_user = false;
-	enum job_wait_reason fail_reason;
+	enum job_state_reason fail_reason;
 
 	xassert(job_ptr);
 	xassert(job_ptr->magic == JOB_MAGIC);
@@ -994,8 +993,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	    (job_ptr->details->min_nodes > part_ptr->max_nodes))
 		fail_reason = WAIT_PART_NODE_LIMIT;
 	if (fail_reason != WAIT_NO_REASON) {
-		if (detail_ptr)
-			detail_ptr->wait_reason = fail_reason;
+		job_ptr->state_reason = fail_reason;
 		last_job_update = time(NULL);
 		if (job_ptr->priority == 0)	/* user/admin hold */
 			return ESLURM_JOB_HELD;
@@ -1055,8 +1053,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	}
 
 	if (error_code) {
-		if (detail_ptr)
-			detail_ptr->wait_reason = WAIT_RESOURCES;
+		job_ptr->state_reason = WAIT_RESOURCES;
 		if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
 			/* Required nodes are down or
 			 * too many nodes requested */
@@ -1090,8 +1087,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	}
 
 	/* assign the nodes and stage_in the job */
-	if (detail_ptr)
-		detail_ptr->wait_reason = WAIT_NO_REASON;
+	job_ptr->state_reason = WAIT_NO_REASON;
 	job_ptr->nodes = bitmap2node_name(select_bitmap);
 	select_bitmap = NULL;	/* nothing left to free */
 	allocate_nodes(job_ptr);
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 538daf7770b..b12b3c2cc13 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -990,6 +990,7 @@ static int _sync_nodes_to_active_job(struct job_record *job_ptr)
 			job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
 			job_ptr->end_time = MIN(job_ptr->end_time, now);
 			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
+			job_ptr->state_reason = FAIL_DOWN_NODE;
 			job_completion_logger(job_ptr);
 			cnt++;
 		} else if ((base_state == NODE_STATE_UNKNOWN) ||
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 12409e776a3..5be60208c7e 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -271,8 +271,6 @@ struct job_details {
 	uint16_t contiguous;		/* set if requires contiguous nodes */
 	uint16_t task_dist;		/* task layout for this job. Only useful
 					 * when Consumable Resources is enabled */
-	uint16_t wait_reason;		/* reason job still pending, see
-					 * slurm.h:enum job_wait_reason */
 	uint32_t num_tasks;		/* number of tasks to start */
 	uint16_t overcommit;		/* processors being over subscribed */
 	uint16_t cpus_per_task;		/* number of processors required for
@@ -374,6 +372,8 @@ struct job_record {
 	uint32_t requid;		/* requester user ID */
 	uint32_t exit_code;		/* exit code for job (status from
 					 * wait call) */
+	uint16_t state_reason;		/* reason job still pending or failed,
+					 * see slurm.h:enum job_state_reason */
 };
 
 struct step_record {
diff --git a/src/squeue/print.c b/src/squeue/print.c
index 3ed0857d8a8..c4baa1b9b74 100644
--- a/src/squeue/print.c
+++ b/src/squeue/print.c
@@ -339,7 +339,7 @@ int _print_job_reason(job_info_t * job, int width, bool right, char* suffix)
 	else {
 		char id[FORMAT_STRING_SIZE];
 		snprintf(id, FORMAT_STRING_SIZE, "%s",
-			 job_reason_string(job->wait_reason));
+			 job_reason_string(job->state_reason));
 		_print_str(id, width, right, true);
 	}
 	if (suffix)
@@ -557,10 +557,12 @@ int _print_job_reason_list(job_info_t * job, int width, bool right,
 #else
 		_print_str("NODELIST(REASON)", width, right, false);
 #endif
-	} else if (job->job_state == JOB_PENDING) {
+	} else if ((job->job_state == JOB_PENDING)
+		|| (job->job_state == JOB_TIMEOUT)
+		|| (job->job_state == JOB_FAILED)) {
 		char id[FORMAT_STRING_SIZE];
 		snprintf(id, FORMAT_STRING_SIZE, "(%s)",
-			 job_reason_string(job->wait_reason));
+			 job_reason_string(job->state_reason));
 		_print_str(id, width, right, true);
 	} else {
 #ifdef HAVE_BG
diff --git a/src/sview/job_info.c b/src/sview/job_info.c
index 1645f75d0f5..758f35b146c 100644
--- a/src/sview/job_info.c
+++ b/src/sview/job_info.c
@@ -1368,7 +1368,7 @@ static void _layout_job_record(GtkTreeView *treeview,
 	add_display_treestore_line(update, treestore, &iter,
 				   find_col_name(display_data_job,
						 SORTID_REASON),
-				   job_reason_string(job_ptr->wait_reason));
+				   job_reason_string(job_ptr->state_reason));
 	add_display_treestore_line(update, treestore, &iter,
 				   find_col_name(display_data_job,
@@ -1661,7 +1661,7 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr,
 		   SORTID_FEATURES, job_ptr->features, -1);
 	gtk_tree_store_set(treestore, iter,
 		   SORTID_REASON,
-		   job_reason_string(job_ptr->wait_reason), -1);
+		   job_reason_string(job_ptr->state_reason), -1);
 	gtk_tree_store_set(treestore, iter,
 		   SORTID_NETWORK, job_ptr->network, -1);
 	gtk_tree_store_set(treestore, iter,
-- 
GitLab
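
Editor's note (not part of the original patch): after this change,
job_ptr->state_reason carries either a pending reason (WAIT_*) or a
failure reason (FAIL_*), and squeue's %R column prints the reason in
parentheses for PENDING, FAILED, and TIMEOUT jobs but the allocated
node list for all other states. The standalone C sketch below mirrors
that logic for illustration only; the enum subset and reason strings
are copied from the patch, while display_reason_or_nodes() and the
sample values are hypothetical and not part of any SLURM API.

    #include <stdio.h>

    /* Subset of slurm.h.in's enum job_state_reason, as added by this patch */
    enum job_state_reason {
            WAIT_NO_REASON = 0,     /* not set or job not pending */
            WAIT_PRIORITY,          /* higher priority jobs exist */
            WAIT_DEPENDENCY,        /* dependent job has not completed */
            FAIL_DOWN_NODE,         /* some node in the allocation failed */
            FAIL_EXIT_CODE,         /* exit code was non-zero */
            FAIL_TIMEOUT            /* reached end of time limit */
    };

    /* Subset of SLURM job states relevant to the %R display rule */
    enum job_states {
            JOB_PENDING, JOB_RUNNING, JOB_COMPLETE, JOB_FAILED, JOB_TIMEOUT
    };

    /* Mirrors job_reason_string() in src/common/slurm_protocol_defs.c */
    static const char *reason_string(enum job_state_reason r)
    {
            switch (r) {
            case WAIT_NO_REASON:   return "None";
            case WAIT_PRIORITY:    return "Priority";
            case WAIT_DEPENDENCY:  return "Dependency";
            case FAIL_DOWN_NODE:   return "NodeDown";
            case FAIL_EXIT_CODE:   return "NonZeroExitCode";
            case FAIL_TIMEOUT:     return "TimeLimit";
            default:               return "?";
            }
    }

    /* Hypothetical helper: the display rule _print_job_reason_list() now
     * implements in src/squeue/print.c -- reason in parentheses for
     * pending and failed/timed-out jobs, node list for everything else */
    static void display_reason_or_nodes(enum job_states state,
                                        enum job_state_reason reason,
                                        const char *nodelist)
    {
            if ((state == JOB_PENDING) || (state == JOB_FAILED) ||
                (state == JOB_TIMEOUT))
                    printf("(%s)\n", reason_string(reason));
            else
                    printf("%s\n", nodelist);
    }

    int main(void)
    {
            /* Sample values only; a real client would read these fields
             * from the job_info_t records returned by the controller */
            display_reason_or_nodes(JOB_PENDING, WAIT_PRIORITY,  "n/a");
            display_reason_or_nodes(JOB_FAILED,  FAIL_EXIT_CODE, "n/a");
            display_reason_or_nodes(JOB_RUNNING, WAIT_NO_REASON, "tux[0-3]");
            return 0;
    }

Compiled with any C compiler (e.g. "cc demo.c"), this prints
"(Priority)", "(NonZeroExitCode)", and "tux[0-3]", matching the three
%R cases the revised squeue.1 man page describes.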