From b75c957ae4a27fccd65eb4274c71f166ab4952ea Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 19 Mar 2003 23:40:27 +0000
Subject: [PATCH] Reset job count on node based upon job table, not jobs
 reported by node registration RPC.

---
 src/slurmctld/job_mgr.c   | 21 ++++++++-------------
 src/slurmctld/node_mgr.c  |  1 +
 src/slurmctld/slurmctld.h |  3 ++-
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 39ae5848e56..5de60e60a63 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -2664,7 +2664,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
  *	are actually running, if not clean up the job records and/or node 
  *	records
  * IN node_name - node which should have jobs running
- * IN job_count - number of jobs which should be running on specified node
+ * IN/OUT job_count - jobs reported by node, reset to node record's job count
  * IN job_id_ptr - pointer to array of job_ids that should be on this node
  * IN step_id_ptr - pointer to array of job step ids that should be on node
  */
@@ -2672,7 +2672,7 @@ void
 validate_jobs_on_node(char *node_name, uint32_t * job_count,
 		      uint32_t * job_id_ptr, uint16_t * step_id_ptr)
 {
-	int i, node_inx, jobs_running = 0;
+	int i, node_inx, jobs_on_node;
 	struct node_record *node_ptr;
 	struct job_record *job_ptr;
 
@@ -2683,13 +2683,6 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 	}
 	node_inx = node_ptr - node_record_table_ptr;
 
-	/* If no job is running here, ensure none are assigned to this node */
-	if (*job_count == 0) {
-		debug("Node %s registered with no jobs", node_name);
-		(void) kill_running_job_by_node_name(node_name, true);
-		return;
-	}
-
 	/* Ensure that jobs running are really supposed to be there */
 	for (i = 0; i < *job_count; i++) {
 		job_ptr = find_job_record(job_id_ptr[i]);
@@ -2704,7 +2697,6 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 
 		else if (job_ptr->job_state == JOB_RUNNING) {
 			if (bit_test(job_ptr->node_bitmap, node_inx)) {
-				jobs_running++;
 				debug3("Registered job %u.%u on node %s ",
 				       job_id_ptr[i], step_id_ptr[i], 
 				       node_name);
@@ -2743,9 +2735,12 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 		}
 	}
 
-	if (jobs_running == 0) {	/* *job_count is > 0 */
-		error("resetting job_count on node %s to zero", node_name);
-		*job_count = 0;
+	jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
+
+	if (jobs_on_node != *job_count) {
+		error("resetting job_count on node %s to %d", 
+		      node_name, jobs_on_node);
+		*job_count = jobs_on_node;
 	}
 
 	return;
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 2a080c72b81..8739a83c040 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1007,6 +1007,7 @@ int update_node ( update_node_msg_t * update_node_msg )
  * IN cpus - number of cpus measured
  * IN real_memory - mega_bytes of real_memory measured
  * IN tmp_disk - mega_bytes of tmp_disk measured
+ * IN job_count - number of jobs allocated to this node
  * IN status - node status code
  * RET 0 if no error, ENOENT if no such node, EINVAL if values too low
  * global: node_record_table_ptr - pointer to global node table
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 6ef5624c1c9..70ddb271705 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -969,7 +969,7 @@ extern int validate_group (struct part_record *part_ptr, uid_t submit_uid);
  *	records, call this function after validate_node_specs() sets the node 
  *	state properly 
  * IN node_name - node which should have jobs running
- * IN job_count - number of jobs which should be running on specified node
+ * IN/OUT job_count - jobs reported by node, reset to node record's job count
  * IN job_id_ptr - pointer to array of job_ids that should be on this node
  * IN step_id_ptr - pointer to array of job step ids that should be on node
  */
@@ -983,6 +983,7 @@ extern void validate_jobs_on_node ( char *node_name, uint32_t *job_count,
  * IN cpus - number of cpus measured
  * IN real_memory - mega_bytes of real_memory measured
  * IN tmp_disk - mega_bytes of tmp_disk measured
+ * IN job_count - number of jobs allocated to this node
  * IN status - node status code
  * RET 0 if no error, ENOENT if no such node, EINVAL if values too low
  * global: node_record_table_ptr - pointer to global node table
-- 
GitLab