From e5a617468906a44d1fa75c7298b5d503e5bc6877 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Tue, 19 Jan 2016 15:08:39 -0800
Subject: [PATCH] Correct handling of front-end running job count

The counter is really intended to reflect the count of running or
  suspended jobs rather than running jobs alone. Previous logic
  would report an underflow for the "job_cnt_run" variable after the
  following sequence of events:
  1. job submitted
  2. job suspended
  3. scontrol reconfig
  4. job cancelled
---
 src/slurmctld/front_end.c   | 3 ++-
 src/slurmctld/job_mgr.c     | 3 ++-
 src/slurmctld/read_config.c | 2 +-
 src/slurmctld/slurmctld.h   | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/slurmctld/front_end.c b/src/slurmctld/front_end.c
index 2fe61400c7b..0682786eea2 100644
--- a/src/slurmctld/front_end.c
+++ b/src/slurmctld/front_end.c
@@ -1006,7 +1006,8 @@ extern void sync_front_end_state(void)
 				     job_ptr->batch_host);
 			} else if (IS_JOB_COMPLETING(job_ptr)) {
 				job_ptr->front_end_ptr->job_cnt_comp++;
-			} else if (IS_JOB_RUNNING(job_ptr)) {
+			} else if (IS_JOB_RUNNING(job_ptr) ||
+				   IS_JOB_SUSPENDED(job_ptr)) {
 				job_ptr->front_end_ptr->job_cnt_run++;
 			}
 		} else {
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index d35df8a1e2f..d592d326252 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -13144,9 +13144,10 @@ static void _suspend_job(struct job_record *job_ptr, uint16_t op,
 
 #ifdef HAVE_FRONT_END
 	xassert(job_ptr->batch_host);
-	if (job_ptr->front_end_ptr)
+	if (job_ptr->front_end_ptr) {
 		agent_args->protocol_version =
 			job_ptr->front_end_ptr->protocol_version;
+	}
 	hostlist_push_host(agent_args->hostlist, job_ptr->batch_host);
 	agent_args->node_count = 1;
 #else
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 194d08a6a09..8906bdef721 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -1955,7 +1955,7 @@ static int _sync_nodes_to_active_job(struct job_record *job_ptr)
 		}
 	}
 
-	if (IS_JOB_RUNNING(job_ptr) && job_ptr->front_end_ptr)
+	if ((IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) && job_ptr->front_end_ptr)
 		job_ptr->front_end_ptr->job_cnt_run++;
 
 	return cnt;
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 7b175d33c7f..6bb22a00fb2 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -284,7 +284,7 @@ typedef struct front_end_record {
 	uid_t *deny_uids;		/* zero terminated list of denied users */
 	char *deny_users;		/* denied user string */
 	uint32_t job_cnt_comp;		/* count of completing jobs on node */
-	uint16_t job_cnt_run;		/* count of running jobs on node */
+	uint16_t job_cnt_run;		/* count of running or suspended jobs */
 	time_t last_response;		/* Time of last communication */
 	uint32_t magic;			/* magic cookie to test data integrity */
 	char *name;			/* frontend node name */
-- 
GitLab