From 90e2347b54c5b257a5d0b48faa2e27ea14339fe3 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Fri, 28 Oct 2016 15:37:28 -0600 Subject: [PATCH] Add on to commit be924b88eb3cd. This completely fixes the situation where a job could be accounted for more than it should be in the _decay_thread inside the priority/multifactor plugin. Previously, end_time_exp, which is what is used to determine whether the job was already processed, wasn't stored for the job. In 16.05 we were able to mostly fix this, but the TRES numbers could still get accounted for multiple times. Since packing an additional field (end_time_exp) into the job state was needed to fix this, we had to wait until 17.02. --- .../priority/multifactor/priority_multifactor.c | 6 ++++++ src/slurmctld/job_mgr.c | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index 75fc0a4797f..eed42a74268 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -928,6 +928,12 @@ static void _init_grp_used_cpu_run_secs(time_t last_ran) if (priority_debug) debug2("job: %u", job_ptr->job_id); + /* If end_time_exp is NO_VAL we have already ran the end for + * this job. We don't want to do it again, so just exit. 
+ */ + if (job_ptr->end_time_exp == (time_t)NO_VAL) + continue; + if (!IS_JOB_RUNNING(job_ptr)) continue; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 4a855bbf549..e2bc4c2e1d2 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1158,6 +1158,7 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer) pack_time(dump_job_ptr->preempt_time, buffer); pack_time(dump_job_ptr->start_time, buffer); pack_time(dump_job_ptr->end_time, buffer); + pack_time(dump_job_ptr->end_time_exp, buffer); pack_time(dump_job_ptr->suspend_time, buffer); pack_time(dump_job_ptr->pre_sus_time, buffer); pack_time(dump_job_ptr->resize_time, buffer); @@ -1274,7 +1275,8 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) uint32_t array_job_id = 0, req_switch = 0, wait4switch = 0; uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET; uint32_t job_state, local_job_id = 0, delay_boot = 0; - time_t start_time, end_time, suspend_time, pre_sus_time, tot_sus_time; + time_t start_time, end_time, end_time_exp, suspend_time, + pre_sus_time, tot_sus_time; time_t preempt_time = 0, deadline = 0; time_t resize_time = 0, now = time(NULL); uint8_t reboot = 0, power_flags = 0; @@ -1384,6 +1386,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) safe_unpack_time(&preempt_time, buffer); safe_unpack_time(&start_time, buffer); safe_unpack_time(&end_time, buffer); + safe_unpack_time(&end_time_exp, buffer); safe_unpack_time(&suspend_time, buffer); safe_unpack_time(&pre_sus_time, buffer); safe_unpack_time(&resize_time, buffer); @@ -1582,6 +1585,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) safe_unpack_time(&preempt_time, buffer); safe_unpack_time(&start_time, buffer); safe_unpack_time(&end_time, buffer); + end_time_exp = end_time; safe_unpack_time(&suspend_time, buffer); safe_unpack_time(&pre_sus_time, buffer); safe_unpack_time(&resize_time, buffer); @@ -1775,6 +1779,7 @@ static int _load_job_state(Buf 
buffer, uint16_t protocol_version) safe_unpack_time(&preempt_time, buffer); safe_unpack_time(&start_time, buffer); safe_unpack_time(&end_time, buffer); + end_time_exp = end_time; safe_unpack_time(&suspend_time, buffer); safe_unpack_time(&pre_sus_time, buffer); safe_unpack_time(&resize_time, buffer); @@ -1988,7 +1993,8 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) job_ptr->direct_set_prio = direct_set_prio; job_ptr->db_index = db_index; job_ptr->derived_ec = derived_ec; - job_ptr->end_time_exp = job_ptr->end_time = end_time; + job_ptr->end_time_exp = end_time_exp; + job_ptr->end_time = end_time; job_ptr->exit_code = exit_code; job_ptr->group_id = group_id; job_ptr->job_state = job_state; -- GitLab