From edb6e6de044624d5127f68af191e07712ffa0957 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 27 Nov 2006 22:27:28 +0000
Subject: [PATCH] Modify job state "reason" field to report why a job failed
 (previously reported only the reason a job was waiting to run). Requires
 cold-start of slurmctld (-c option).

---
 NEWS                                |  3 +++
 doc/man/man1/squeue.1               | 32 +++++++++++++++++++---
 slurm/slurm.h.in                    | 24 ++++++++++++-----
 src/api/job_info.c                  |  3 ++-
 src/common/slurm_protocol_defs.c    | 18 ++++++++++++-
 src/common/slurm_protocol_defs.h    |  2 +-
 src/common/slurm_protocol_pack.c    | 11 ++++----
 src/plugins/sched/wiki2/start_job.c |  9 +++----
 src/slurmctld/job_mgr.c             | 41 ++++++++++++++++++++---------
 src/slurmctld/job_scheduler.c       |  1 +
 src/slurmctld/node_mgr.c            |  1 +
 src/slurmctld/node_scheduler.c      | 12 +++------
 src/slurmctld/read_config.c         |  1 +
 src/slurmctld/slurmctld.h           |  4 +--
 src/squeue/print.c                  |  8 +++---
 src/sview/job_info.c                |  4 +--
 16 files changed, 124 insertions(+), 50 deletions(-)

diff --git a/NEWS b/NEWS
index b3f0c6bb4f1..f00d2cb339b 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,9 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 1.2.0-pre8
 =============================
+ -- Modify job state "reason" field to report why a job failed (previously
+    reported only the reason a job was waiting to run). Requires cold-start
+    of slurmctld (-c option).
 
 * Changes in SLURM 1.2.0-pre7
 =============================
diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1
index ed6d7126a82..6b6eb55faf3 100644
--- a/doc/man/man1/squeue.1
+++ b/doc/man/man1/squeue.1
@@ -1,4 +1,4 @@
-.TH SQUEUE "1" "March 2006" "squeue 1.1" "Slurm components"
+.TH SQUEUE "1" "November 2006" "squeue 1.2" "Slurm components"
 
 .SH "NAME"
 squeue \- view information about jobs located in the SLURM scheduling queue.
@@ -174,13 +174,15 @@ Priority of the job (converted to a floating point number between 0.0 and 1.0
 Partition of the job or job step
 .TP
 \fB%r\fR
-The reason a job is waiting for execution.
+The reason a job is in its current state.
 See the \fBJOB REASON CODES\fR section below for more information.
 .TP
 \fB%R\fR
-For running or completed jobs: the list of allocate nodes.
 For pending jobs: the reason a job is waiting for execution 
-is printed within parenthesis. 
+is printed within parentheses.
+For jobs that terminated in failure: an explanation of why the
+job failed is printed within parentheses.
+For all other job states: the list of allocated nodes.
 See the \fBJOB REASON CODES\fR section below for more information.
 .TP
 \fB%s\fR 
@@ -308,6 +310,28 @@ One or more higher priority jobs exist for this partition.
 .TP
 \fBResources\fR
 The job is waiting for resources to become available.
+.TP
+\fBNodeDown\fR
+A node required by the job is down.
+.TP
+\fBBadConstraints\fR
+The job's constraints cannot be satisfied.
+.TP
+\fBSystemFailure\fR
+Failure of the SLURM system, a file system, the network, etc.
+.TP
+\fBJobLaunchFailure\fR
+The job could not be launched. 
+This may be due to a file system problem, invalid program name, etc.
+.TP
+\fBNonZeroExitCode\fR
+The job terminated with a non\-zero exit code.
+.TP
+\fBTimeLimit\fR
+The job exhausted its time limit.
+.TP
+\fBInactiveLimit\fR
+The job reached the system InactiveLimit.
 
 .SH "JOB STATE CODES"
 Jobs typically pass through several states in the course of their 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index d792acc9ee9..9fa578bb8dd 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -172,9 +172,11 @@ enum job_states {
 
 #define NICE_OFFSET 10000	/* offset for job's nice value */
 
-/* Reason for job to be pending rather than executing. If multiple reasons 
- * exists, only one is given for the sake of system efficiency */
-enum job_wait_reason {
+/* Reason for a job to be pending rather than executing, or the reason
+ * for job failure. If multiple reasons exist, only one is given for the
+ * sake of system efficiency */
+enum job_state_reason {
+/* Reasons for job to be pending */
 	WAIT_NO_REASON = 0,	/* not set or job not pending */
 	WAIT_PRIORITY,		/* higher priority jobs exist */
 	WAIT_DEPENDENCY,	/* dependent job has not completed */
@@ -183,7 +185,17 @@ enum job_wait_reason {
 	WAIT_PART_TIME_LIMIT,	/* request exceeds partition time limit */
 	WAIT_PART_STATE,	/* requested partition is down */
 	WAIT_HELD,		/* job is held, priority==0 */
-	WAIT_TIME		/* job waiting for specific begin time */
+	WAIT_TIME,		/* job waiting for specific begin time */
+	WAIT_TBD1,		/* placeholder, reserved for future use */
+	WAIT_TBD2,		/* placeholder, reserved for future use */
+	FAIL_DOWN_PARTITION,	/* partition for job is DOWN */
+	FAIL_DOWN_NODE,		/* some node in the allocation failed */
+	FAIL_BAD_CONSTRAINTS,	/* constraints cannot be satisfied */
+	FAIL_SYSTEM,		/* slurm system failure */
+	FAIL_LAUNCH,		/* unable to launch job */
+	FAIL_EXIT_CODE,		/* exit code was non-zero */
+	FAIL_TIMEOUT,		/* reached end of time limit */
+	FAIL_INACTIVE_LIMIT	/* reached slurm InactiveLimit */
 };
 
 enum job_acct_types {
@@ -569,8 +581,8 @@ typedef struct job_info {
 	uint32_t dependency;	/* defer until specified job completes */
 	uint32_t exit_code;	/* exit code for job (status from wait call) */
 	char *account;		/* charge to specified account */
-	uint16_t wait_reason;	/* reason job still pending, see
-				 * slurm.h:enum job_wait_reason */
+	uint16_t state_reason;	/* reason job still pending or failed, see
+				 * slurm.h:enum job_state_reason */
 	char *network;		/* network specification */
 	char *comment;		/* arbitrary comment (used by Moab scheduler) */
 	select_jobinfo_t select_jobinfo; /* opaque data type,
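
A minimal client sketch (not part of this patch) showing how the new
state_reason field can be read through the job info API. The string
helpers live in src/common/slurm_protocol_defs.h rather than the public
header, so they are declared extern here; link against libslurm.

	#include <stdio.h>
	#include <slurm/slurm.h>
	#include <slurm/slurm_errno.h>

	/* internal helpers declared in src/common/slurm_protocol_defs.h */
	extern char *job_reason_string(enum job_state_reason inx);
	extern char *job_state_string(enum job_states inx);

	int main(void)
	{
		job_info_msg_t *msg = NULL;
		uint32_t i;

		/* load the current job table from slurmctld */
		if (slurm_load_jobs((time_t) 0, &msg, 0) != SLURM_SUCCESS) {
			slurm_perror("slurm_load_jobs");
			return 1;
		}
		for (i = 0; i < msg->record_count; i++) {
			job_info_t *job = &msg->job_array[i];
			printf("job %u state=%s reason=%s\n", job->job_id,
			       job_state_string(job->job_state),
			       job_reason_string(job->state_reason));
		}
		slurm_free_job_info_msg(msg);
		return 0;
	}
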
diff --git a/src/api/job_info.c b/src/api/job_info.c
index e2074b87ab4..77ff39e890c 100644
--- a/src/api/job_info.c
+++ b/src/api/job_info.c
@@ -376,7 +376,8 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
 	snprintf(tmp_line, sizeof(tmp_line), 
 		"Dependency=%u Account=%s Reason=%s Network=%s",
 		job_ptr->dependency, job_ptr->account,
-		job_reason_string(job_ptr->wait_reason), job_ptr->network);
+		job_reason_string(job_ptr->state_reason), 
+		job_ptr->network);
 	xstrcat(out, tmp_line);
 	if (one_liner)
 		xstrcat(out, " ");
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 360df64d6a7..3d741438b9a 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -526,7 +526,7 @@ void inline slurm_free_suspend_msg(suspend_msg_t *msg)
 }
 
 /* Given a job's reason for waiting, return a descriptive string */
-extern char *job_reason_string(enum job_wait_reason inx)
+extern char *job_reason_string(enum job_state_reason inx)
 {
 	switch (inx) {
 		case WAIT_NO_REASON:
@@ -547,6 +547,22 @@ extern char *job_reason_string(enum job_wait_reason inx)
 			return "JobHeld";
 		case WAIT_TIME:
 			return "BeginTime";
+		case FAIL_DOWN_PARTITION:
+			return "PartitionDown";
+		case FAIL_DOWN_NODE:
+			return "NodeDown";
+		case FAIL_BAD_CONSTRAINTS:
+			return "BadConstraints";
+		case FAIL_SYSTEM:
+			return "SystemFailure";
+		case FAIL_LAUNCH:
+			return "JobLaunchFailure";
+		case FAIL_EXIT_CODE:
+			return "NonZeroExitCode";
+		case FAIL_TIMEOUT:
+			return "TimeLimit";
+		case FAIL_INACTIVE_LIMIT:
+			return "InactiveLimit";
 		default:
 			return "?";
 	}
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index b39273b441b..e5a74ab97f0 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -741,7 +741,7 @@ void inline slurm_free_node_select_msg(
 extern int slurm_free_msg_data(slurm_msg_type_t type, void *data);
 extern uint32_t slurm_get_return_code(slurm_msg_type_t type, void *data);
 
-extern char *job_reason_string(enum job_wait_reason inx);
+extern char *job_reason_string(enum job_state_reason inx);
 extern char *job_state_string(enum job_states inx);
 extern char *job_state_string_compact(enum job_states inx);
 extern char *node_state_string(enum node_states inx);
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index ea24da432f5..d7ad96ccb10 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -1938,10 +1938,12 @@ _unpack_job_info_members(job_info_t * job, Buf buffer)
 	safe_unpack32(&job->user_id, buffer);
 	safe_unpack32(&job->group_id, buffer);
 
-	safe_unpack16(&job->job_state,  buffer);
-	safe_unpack16(&job->batch_flag, buffer);
-	safe_unpack32(&job->alloc_sid,  buffer);
-	safe_unpack32(&job->time_limit, buffer);
+	safe_unpack16(&job->job_state,    buffer);
+	safe_unpack16(&job->batch_flag,   buffer);
+	safe_unpack16(&job->state_reason, buffer);
+
+	safe_unpack32(&job->alloc_sid,    buffer);
+	safe_unpack32(&job->time_limit,   buffer);
 
 	safe_unpack_time(&job->submit_time, buffer);
 	safe_unpack_time(&job->start_time, buffer);
@@ -1992,7 +1994,6 @@ _unpack_job_info_members(job_info_t * job, Buf buffer)
 	safe_unpack32(&job->job_min_memory, buffer);
 	safe_unpack32(&job->job_max_memory, buffer);
 	safe_unpack32(&job->job_min_tmp_disk, buffer);
-	safe_unpack16(&job->wait_reason, buffer);
 
 	safe_unpackstr_xmalloc(&job->req_nodes, &uint16_tmp, buffer);
 	safe_unpackstr_xmalloc(&node_inx_str, &uint16_tmp, buffer);
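
The unpack order above must stay byte-for-byte symmetric with the pack
order in slurmctld's pack_job(), which this patch updates below; a sketch
of the invariant, using the field names from this patch:

	/* slurmctld: pack_job()               client: _unpack_job_info_members() */
	pack16(job->job_state,    buffer);     safe_unpack16(&job->job_state,    buffer);
	pack16(job->batch_flag,   buffer);     safe_unpack16(&job->batch_flag,   buffer);
	pack16(job->state_reason, buffer);     safe_unpack16(&job->state_reason, buffer);
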
diff --git a/src/plugins/sched/wiki2/start_job.c b/src/plugins/sched/wiki2/start_job.c
index 6a0d55f4b3a..00cc6f2018b 100644
--- a/src/plugins/sched/wiki2/start_job.c
+++ b/src/plugins/sched/wiki2/start_job.c
@@ -182,12 +182,11 @@ static int	_start_job(uint32_t jobid, char *hostlist,
 			job_ptr->priority = 0;
 			if (job_ptr->job_state == JOB_FAILED)
 				wait_string = "Invalid request, job aborted";
-			else if (job_ptr->details) {
-				wait_reason = job_ptr->details->wait_reason;
+			else {
+				wait_reason = job_ptr->state_reason;
 				wait_string = job_reason_string(wait_reason);
-				job_ptr->details->wait_reason = WAIT_HELD;
-			} else
-				wait_string = "Unknown";
+				job_ptr->state_reason = WAIT_HELD;
+			}
 			*err_code = -910 - wait_reason;
 			snprintf(tmp_msg, sizeof(tmp_msg),
 				"Could not start job %u: %s",
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index e24580de5f6..a0f5e6dccea 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -496,6 +496,7 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer)
 	pack16(dump_job_ptr->alloc_resp_port, buffer);
 	pack16(dump_job_ptr->other_port, buffer);
 	pack16(dump_job_ptr->mail_type, buffer);
+	pack16(dump_job_ptr->state_reason, buffer);
 
 	packstr(dump_job_ptr->alloc_resp_host, buffer);
 	packstr(dump_job_ptr->other_host, buffer);
@@ -547,7 +548,7 @@ static int _load_job_state(Buf buffer)
 	time_t start_time, end_time, suspend_time, pre_sus_time;
 	uint16_t job_state, next_step_id, details, batch_flag, step_flag;
 	uint16_t kill_on_node_fail, kill_on_step_done, name_len;
-	uint16_t alloc_resp_port, other_port, mail_type;
+	uint16_t alloc_resp_port, other_port, mail_type, state_reason;
 	char *nodes = NULL, *partition = NULL, *name = NULL;
 	char *alloc_node = NULL, *alloc_resp_host = NULL, *other_host = NULL;
 	char *account = NULL, *network = NULL, *mail_user = NULL;
@@ -580,6 +581,7 @@ static int _load_job_state(Buf buffer)
 	safe_unpack16(&alloc_resp_port, buffer);
 	safe_unpack16(&other_port, buffer);
 	safe_unpack16(&mail_type, buffer);
+	safe_unpack16(&state_reason, buffer);
 
 	safe_unpackstr_xmalloc(&alloc_resp_host, &name_len, buffer);
 	safe_unpackstr_xmalloc(&other_host, &name_len, buffer);
@@ -647,6 +649,7 @@ static int _load_job_state(Buf buffer)
 	    (_load_job_details(job_ptr, buffer))) {
 		job_ptr->job_state = JOB_FAILED;
 		job_ptr->exit_code = 1;
+		job_ptr->state_reason = FAIL_SYSTEM;
 		job_ptr->end_time = time(NULL);
 		goto unpack_error;
 	}
@@ -664,6 +667,7 @@ static int _load_job_state(Buf buffer)
 	job_ptr->next_step_id = next_step_id;
 	job_ptr->dependency   = dependency;
 	job_ptr->exit_code    = exit_code;
+	job_ptr->state_reason = state_reason;
 	job_ptr->num_procs    = num_procs;
 	job_ptr->time_last_active = time(NULL);
 	strncpy(job_ptr->name, name, MAX_JOBNAME_LEN);
@@ -961,6 +965,7 @@ extern int kill_job_by_part_name(char *part_name)
 			     job_ptr->job_id, part_name);
 			job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
 			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
+			job_ptr->state_reason = FAIL_DOWN_PARTITION;
 			if (suspended)
 				job_ptr->end_time = job_ptr->suspend_time;
 			else
@@ -1049,6 +1054,7 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test)
 					JOB_COMPLETING;
 				job_ptr->exit_code = 
 					MAX(job_ptr->exit_code, 1);
+				job_ptr->state_reason = FAIL_DOWN_NODE;
 				if (suspended)
 					job_ptr->end_time =
 						job_ptr->suspend_time;
@@ -1327,6 +1333,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 		if (immediate && job_ptr) {
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 			job_ptr->start_time = job_ptr->end_time = time(NULL);
 			job_completion_logger(job_ptr);
 		}
@@ -1358,6 +1365,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 	if (immediate && (too_fragmented || (!top_prio) || (!independent))) {
 		job_ptr->job_state  = JOB_FAILED;
 		job_ptr->exit_code  = 1;
+		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 		job_ptr->start_time = job_ptr->end_time = time(NULL);
 		job_completion_logger(job_ptr);
 		if (!independent)
@@ -1383,6 +1391,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 		if (immediate) {
 			job_ptr->job_state  = JOB_FAILED;
 			job_ptr->exit_code  = 1;
+			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 			job_ptr->start_time = job_ptr->end_time = time(NULL);
 			job_completion_logger(job_ptr);
 		} else		/* job remains queued */
@@ -1395,6 +1404,7 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
 	if (error_code) {	/* fundamental flaw in job request */
 		job_ptr->job_state  = JOB_FAILED;
 		job_ptr->exit_code  = 1;
+		job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 		job_ptr->start_time = job_ptr->end_time = time(NULL);
 		job_completion_logger(job_ptr);
 		return error_code;
@@ -1439,6 +1449,7 @@ extern int job_fail(uint32_t job_id)
 		last_job_update                 = now;
 		job_ptr->job_state = JOB_FAILED | JOB_COMPLETING;
 		job_ptr->exit_code = 1;
+		job_ptr->state_reason = FAIL_LAUNCH;
 		deallocate_nodes(job_ptr, false, suspended);
 		job_completion_logger(job_ptr);
 		return SLURM_SUCCESS;
@@ -1642,11 +1653,13 @@ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue,
 		} else if (WEXITSTATUS(job_return_code)) {
 			job_ptr->job_state = JOB_FAILED   | job_comp_flag;
 			job_ptr->exit_code = job_return_code;
+			job_ptr->state_reason = FAIL_EXIT_CODE;
 		}
 		else if (job_comp_flag &&		/* job was running */
 			 (job_ptr->end_time < now)) {	/* over time limit */
 			job_ptr->job_state = JOB_TIMEOUT  | job_comp_flag;
 			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
+			job_ptr->state_reason = FAIL_TIMEOUT;
 		} else
 			job_ptr->job_state = JOB_COMPLETE | job_comp_flag;
 		if (suspended)
@@ -1687,7 +1700,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 {
 	int error_code = SLURM_SUCCESS, i;
 	struct job_details *detail_ptr;
-	enum job_wait_reason fail_reason;
+	enum job_state_reason fail_reason;
 	struct part_record *part_ptr;
 	bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL;
 	bool super_user = false;
@@ -1893,6 +1906,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 							 job_ptr->job_id))) {
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_SYSTEM;
 			job_ptr->start_time = job_ptr->end_time = time(NULL);
 			error_code = ESLURM_WRITING_TO_FILE;
 			goto cleanup;
@@ -1930,8 +1944,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 	if (fail_reason != WAIT_NO_REASON) {
 		error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
 		job_ptr->priority = 1;      /* Move to end of queue */
-		if (detail_ptr)
-			detail_ptr->wait_reason = fail_reason;
+		job_ptr->state_reason = fail_reason;
 	}
 	jobacct_g_job_start_slurmctld(job_ptr);
 	
@@ -2514,6 +2527,7 @@ void job_time_limit(void)
 			info("Inactivity time limit reached for JobId=%u",
 			     job_ptr->job_id);
 			_job_timed_out(job_ptr);
+			job_ptr->state_reason = FAIL_INACTIVE_LIMIT;
 			continue;
 		}
 		if ((job_ptr->time_limit != INFINITE)
@@ -2522,6 +2536,7 @@ void job_time_limit(void)
 			info("Time limit exhausted for JobId=%u",
 			     job_ptr->job_id);
 			_job_timed_out(job_ptr);
+			job_ptr->state_reason = FAIL_TIMEOUT;
 			continue;
 		}
 
@@ -2839,6 +2854,8 @@ void pack_job(struct job_record *dump_job_ptr, Buf buffer)
 
 	pack16(dump_job_ptr->job_state, buffer);
 	pack16(dump_job_ptr->batch_flag, buffer);
+	pack16(dump_job_ptr->state_reason, buffer);
+
 	pack32(dump_job_ptr->alloc_sid, buffer);
 	if ((dump_job_ptr->time_limit == NO_VAL) && dump_job_ptr->part_ptr)
 		pack32(dump_job_ptr->part_ptr->max_time, buffer);
@@ -2923,12 +2940,11 @@ static void _pack_pending_job_details(struct job_details *detail_ptr,
 		pack16(detail_ptr->shared, buffer);
 		pack16(detail_ptr->contiguous, buffer);
 		pack16(detail_ptr->cpus_per_task, buffer);
-
 		pack16(detail_ptr->job_min_procs, buffer);
+
 		pack32(detail_ptr->job_min_memory, buffer);
 		pack32(detail_ptr->job_max_memory, buffer);
 		pack32(detail_ptr->job_min_tmp_disk, buffer);
-		pack16(detail_ptr->wait_reason, buffer);
 
 		packstr(detail_ptr->req_nodes, buffer);
 		pack_bit_fmt(detail_ptr->req_node_bitmap, buffer);
@@ -2947,7 +2963,6 @@ static void _pack_pending_job_details(struct job_details *detail_ptr,
 		pack32((uint32_t) 0, buffer);
 		pack32((uint32_t) 0, buffer);
 		pack32((uint32_t) 0, buffer);
-		pack16((uint16_t) 0, buffer);
 
 		packnull(buffer);
 		packnull(buffer);
@@ -3062,6 +3077,7 @@ void reset_job_bitmaps(void)
 					JOB_COMPLETING;
 			}
 			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
+			job_ptr->state_reason = FAIL_DOWN_NODE;
 			job_completion_logger(job_ptr);
 		}
 	}
@@ -3252,9 +3268,9 @@ static bool _top_priority(struct job_record *job_ptr)
 
 	if ((!top) && detail_ptr) {	/* not top prio */
 		if (job_ptr->priority == 0)		/* user/admin hold */
-			detail_ptr->wait_reason = WAIT_HELD;
+			job_ptr->state_reason = WAIT_HELD;
 		else if (job_ptr->priority != 1)	/* not system hold */
-			detail_ptr->wait_reason = WAIT_PRIORITY;
+			job_ptr->state_reason = WAIT_PRIORITY;
 	}
 	return top;
 #endif
@@ -3845,6 +3861,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 			      job_id_ptr[i], step_id_ptr[i], node_name);
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_SYSTEM;
 			last_job_update    = now;
 			job_ptr->start_time = job_ptr->end_time  = now;
 			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
@@ -4048,6 +4065,7 @@ static void _validate_job_files(List batch_dirs)
 			      job_ptr->job_id);
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_SYSTEM;
 			job_ptr->start_time = job_ptr->end_time = time(NULL);
 			job_completion_logger(job_ptr);
 		}
@@ -4244,7 +4262,7 @@ extern bool job_independent(struct job_record *job_ptr)
 	struct job_details *detail_ptr = job_ptr->details;
 
 	if (detail_ptr && (detail_ptr->begin_time > time(NULL))) {
-		detail_ptr->wait_reason = WAIT_TIME;
+		job_ptr->state_reason = WAIT_TIME;
 		return false;	/* not yet time */
 	}
 		
@@ -4259,8 +4277,7 @@ extern bool job_independent(struct job_record *job_ptr)
 	    (dep_ptr->job_state >= JOB_COMPLETE))
 		return true;
 
-	if (detail_ptr)
-		detail_ptr->wait_reason = WAIT_DEPENDENCY;
+	job_ptr->state_reason = WAIT_DEPENDENCY;
 	return false;	/* job exists and incomplete */
 }
 /*
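
The failure paths in job_mgr.c repeat the same bookkeeping: set
job_state, exit_code, and the new state_reason, stamp the end time, and
log the completion. A hypothetical helper (not in this patch) capturing
that pattern:

	/* Hypothetical consolidation of the repeated failure bookkeeping.
	 * Mirrors the inline code added by this patch in job_allocate(),
	 * _job_create(), etc. */
	static void _fail_job(struct job_record *job_ptr,
			      enum job_state_reason reason)
	{
		job_ptr->job_state    = JOB_FAILED;
		job_ptr->exit_code    = 1;
		job_ptr->state_reason = reason;
		job_ptr->start_time   = job_ptr->end_time = time(NULL);
		job_completion_logger(job_ptr);
	}
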
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 6d2ba47bbc2..9fcb1b489b4 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -259,6 +259,7 @@ int schedule(void)
 			last_job_update = time(NULL);
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
 			job_ptr->start_time = job_ptr->end_time = time(NULL);
 			job_completion_logger(job_ptr);
 			delete_job_details(job_ptr);
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index e54fa2734f8..54a0d0b9318 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1304,6 +1304,7 @@ extern int validate_nodes_via_front_end(uint32_t job_count,
 			/* FIXME: Could possibly recover the job */
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->exit_code = 1;
+			job_ptr->state_reason = FAIL_SYSTEM;
 			last_job_update    = now;
 			job_ptr->start_time = job_ptr->end_time  = now;
 			kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 8f61b1f2bd3..f6297d8bcbe 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -956,12 +956,11 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 {
 	int error_code = SLURM_SUCCESS, i, node_set_size = 0;
 	bitstr_t *select_bitmap = NULL;
-	struct job_details *detail_ptr = job_ptr->details;
 	struct node_set *node_set_ptr = NULL;
 	struct part_record *part_ptr = job_ptr->part_ptr;
 	uint32_t min_nodes, max_nodes, req_nodes;
 	int super_user = false;
-	enum job_wait_reason fail_reason;
+	enum job_state_reason fail_reason;
 
 	xassert(job_ptr);
 	xassert(job_ptr->magic == JOB_MAGIC);
@@ -994,8 +993,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	         (job_ptr->details->min_nodes > part_ptr->max_nodes))
 		 fail_reason = WAIT_PART_NODE_LIMIT;
 	if (fail_reason != WAIT_NO_REASON) {
-		if (detail_ptr)
-			detail_ptr->wait_reason = fail_reason;
+		job_ptr->state_reason = fail_reason;
 		last_job_update = time(NULL);
 		if (job_ptr->priority == 0)	/* user/admin hold */
 			return ESLURM_JOB_HELD;
@@ -1055,8 +1053,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	}
 
 	if (error_code) {
-		if (detail_ptr)
-			detail_ptr->wait_reason = WAIT_RESOURCES;
+		job_ptr->state_reason = WAIT_RESOURCES;
 		if (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
 			/* Required nodes are down or 
 			 * too many nodes requested */
@@ -1090,8 +1087,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 	}
 
 	/* assign the nodes and stage_in the job */
-	if (detail_ptr)
-		detail_ptr->wait_reason = WAIT_NO_REASON;
+	job_ptr->state_reason = WAIT_NO_REASON;
 	job_ptr->nodes = bitmap2node_name(select_bitmap);
 	select_bitmap = NULL;	/* nothing left to free */
 	allocate_nodes(job_ptr);
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 538daf7770b..b12b3c2cc13 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -990,6 +990,7 @@ static int _sync_nodes_to_active_job(struct job_record *job_ptr)
 			job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
 			job_ptr->end_time = MIN(job_ptr->end_time, now);
 			job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
+			job_ptr->state_reason = FAIL_DOWN_NODE;
 			job_completion_logger(job_ptr);
 			cnt++;
 		} else if ((base_state == NODE_STATE_UNKNOWN) || 
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 12409e776a3..5be60208c7e 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -271,8 +271,6 @@ struct job_details {
 	uint16_t contiguous;		/* set if requires contiguous nodes */
 	uint16_t task_dist;		/* task layout for this job. Only useful
                                          * when Consumable Resources is enabled */
-	uint16_t wait_reason;		/* reason job still pending, see
-					 * slurm.h:enum job_wait_reason */
 	uint32_t num_tasks;		/* number of tasks to start */
 	uint16_t overcommit;		/* processors being over subscribed */
 	uint16_t cpus_per_task;		/* number of processors required for 
@@ -374,6 +372,8 @@ struct job_record {
 	uint32_t requid;            	/* requester user ID */
 	uint32_t exit_code;		/* exit code for job (status from 
 					 * wait call) */
+	uint16_t state_reason;		/* reason job still pending or failed,
+					 * see slurm.h:enum job_state_reason */
 };
 
 struct 	step_record {
diff --git a/src/squeue/print.c b/src/squeue/print.c
index 3ed0857d8a8..c4baa1b9b74 100644
--- a/src/squeue/print.c
+++ b/src/squeue/print.c
@@ -339,7 +339,7 @@ int _print_job_reason(job_info_t * job, int width, bool right, char* suffix)
 	else {
 		char id[FORMAT_STRING_SIZE];
 		snprintf(id, FORMAT_STRING_SIZE, "%s", 
-			job_reason_string(job->wait_reason));
+			job_reason_string(job->state_reason));
 		_print_str(id, width, right, true);
 	}
 	if (suffix)
@@ -557,10 +557,12 @@ int _print_job_reason_list(job_info_t * job, int width, bool right,
 #else
 		_print_str("NODELIST(REASON)", width, right, false);
 #endif
-	} else if (job->job_state == JOB_PENDING) {
+	} else if ((job->job_state == JOB_PENDING)
+	||         (job->job_state == JOB_TIMEOUT)
+	||         (job->job_state == JOB_FAILED)) {
 		char id[FORMAT_STRING_SIZE];
 		snprintf(id, FORMAT_STRING_SIZE, "(%s)", 
-			job_reason_string(job->wait_reason));
+			job_reason_string(job->state_reason));
 		_print_str(id, width, right, true);
 	} else {
 #ifdef HAVE_BG
diff --git a/src/sview/job_info.c b/src/sview/job_info.c
index 1645f75d0f5..758f35b146c 100644
--- a/src/sview/job_info.c
+++ b/src/sview/job_info.c
@@ -1368,7 +1368,7 @@ static void _layout_job_record(GtkTreeView *treeview,
 	add_display_treestore_line(update, treestore, &iter, 
 				   find_col_name(display_data_job,
 						 SORTID_REASON),
-				   job_reason_string(job_ptr->wait_reason));
+				   job_reason_string(job_ptr->state_reason));
 
 	add_display_treestore_line(update, treestore, &iter, 
 				   find_col_name(display_data_job,
@@ -1661,7 +1661,7 @@ static void _update_job_record(sview_job_info_t *sview_job_info_ptr,
 			   SORTID_FEATURES, job_ptr->features, -1);
 	gtk_tree_store_set(treestore, iter,
 			   SORTID_REASON,
-			   job_reason_string(job_ptr->wait_reason), -1);
+			   job_reason_string(job_ptr->state_reason), -1);
 	gtk_tree_store_set(treestore, iter,
 			   SORTID_NETWORK, job_ptr->network, -1);
 	gtk_tree_store_set(treestore, iter,
-- 
GitLab