From b75c957ae4a27fccd65eb4274c71f166ab4952ea Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 19 Mar 2003 23:40:27 +0000 Subject: [PATCH] Reset job count on node based upon job table, not jobs reported by node registration RPC. --- src/slurmctld/job_mgr.c | 21 ++++++++------------- src/slurmctld/node_mgr.c | 1 + src/slurmctld/slurmctld.h | 3 ++- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 39ae5848e56..5de60e60a63 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2664,7 +2664,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) * are actually running, if not clean up the job records and/or node * records * IN node_name - node which should have jobs running - * IN job_count - number of jobs which should be running on specified node + * IN/OUT job_count - number of jobs which should be running on specified node * IN job_id_ptr - pointer to array of job_ids that should be on this node * IN step_id_ptr - pointer to array of job step ids that should be on node */ @@ -2672,7 +2672,7 @@ void validate_jobs_on_node(char *node_name, uint32_t * job_count, uint32_t * job_id_ptr, uint16_t * step_id_ptr) { - int i, node_inx, jobs_running = 0; + int i, node_inx, jobs_on_node; struct node_record *node_ptr; struct job_record *job_ptr; @@ -2683,13 +2683,6 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, } node_inx = node_ptr - node_record_table_ptr; - /* If no job is running here, ensure none are assigned to this node */ - if (*job_count == 0) { - debug("Node %s registered with no jobs", node_name); - (void) kill_running_job_by_node_name(node_name, true); - return; - } - /* Ensure that jobs running are really supposed to be there */ for (i = 0; i < *job_count; i++) { job_ptr = find_job_record(job_id_ptr[i]); @@ -2704,7 +2697,6 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, else if (job_ptr->job_state == JOB_RUNNING) { if (bit_test(job_ptr->node_bitmap, node_inx)) { - jobs_running++; debug3("Registered job %u.%u on node %s ", job_id_ptr[i], step_id_ptr[i], node_name); @@ -2743,9 +2735,12 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count, } } - if (jobs_running == 0) { /* *job_count is > 0 */ - error("resetting job_count on node %s to zero", node_name); - *job_count = 0; + jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt; + + if (jobs_on_node != *job_count) { + error("resetting job_count on node %s to %d", + node_name, jobs_on_node); + *job_count = jobs_on_node; } return; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 2a080c72b81..8739a83c040 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1007,6 +1007,7 @@ int update_node ( update_node_msg_t * update_node_msg ) * IN cpus - number of cpus measured * IN real_memory - mega_bytes of real_memory measured * IN tmp_disk - mega_bytes of tmp_disk measured + * IN job_count - number of jobs allocated to this node * IN status - node status code * RET 0 if no error, ENOENT if no such node, EINVAL if values too low * global: node_record_table_ptr - pointer to global node table diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 6ef5624c1c9..70ddb271705 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -969,7 +969,7 @@ extern int validate_group (struct part_record *part_ptr, uid_t submit_uid); * records, call this function after validate_node_specs() sets the node * state properly * IN node_name - node which should have jobs running - * IN job_count - number of jobs which should be running on specified node + * IN/OUT job_count - number of jobs which should be running on specified node * IN job_id_ptr - pointer to array of job_ids that should be on this node * IN step_id_ptr - pointer to array of job step ids that should be on node */ @@ -983,6 +983,7 @@ extern void validate_jobs_on_node ( char *node_name, uint32_t *job_count, * IN cpus - number of cpus measured * IN real_memory - mega_bytes of real_memory measured * IN tmp_disk - mega_bytes of tmp_disk measured + * IN job_count - number of jobs allocated to this node * IN status - node status code * RET 0 if no error, ENOENT if no such node, EINVAL if values too low * global: node_record_table_ptr - pointer to global node table -- GitLab