From c8c738c5d6b5dcab08e61d0c85af1507b95e3868 Mon Sep 17 00:00:00 2001 From: Brian Christiansen <brian@schedmd.com> Date: Tue, 19 Jul 2016 16:38:51 -0600 Subject: [PATCH] Allow upgrading jobs to run even if jobid>MaxJobID New jobs will get job IDs within the new range. Even though the pre-existing large jobid-jobs will have federated jobids, they will be treated as local jobs. --- NEWS | 8 ++++---- RELEASE_NOTES | 7 ++++--- src/slurmctld/job_mgr.c | 20 ++++++-------------- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/NEWS b/NEWS index 674c5954357..de179e26d4f 100644 --- a/NEWS +++ b/NEWS @@ -4,10 +4,10 @@ documents those changes that are of interest to users and administrators. * Federation Changes - Put below when merged in. ============================== -- In order to support federated jobs, the MaxJobID configuration parameter - default value has been reduced from 2,147,418,112 to 67,043,328 and it's - maximum value is now 67,108,863. ANY JOBS WITH A JOB ID ABOVE 67,108,863 - WILL BE PURGED WHEN SLURM IS UPGRADED FROM AN OLDER VERSION! - + default value has been reduced from 2,147,418,112 to 67,043,328 and its + maximum value is now 67,108,863. Upon upgrading, any pre-existing jobs that + have a job ID above the new range will continue to run and new jobs will get + job IDs in the new range. * Changes in Slurm 17.02.0pre1 ============================== diff --git a/RELEASE_NOTES b/RELEASE_NOTES index c15ad70f39c..1916f60dc6e 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -33,9 +33,10 @@ upgrading Slurm to a new major release. HIGHLIGHTS ========== -- In order to support federated jobs, the MaxJobID configuration parameter - default value has been reduced from 2,147,418,112 to 67,043,328 and it's - maximum value is now 67,108,863. ANY JOBS WITH A JOB ID ABOVE 67,108,863 - WILL BE PURGED WHEN SLURM IS UPGRADED FROM AN OLDER VERSION! + default value has been reduced from 2,147,418,112 to 67,043,328 and its + maximum value is now 67,108,863. Upon upgrading, any pre-existing jobs that + have a job ID above the new range will continue to run and new jobs will get + job IDs in the new range. RPMBUILD CHANGES ================ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index f757276ce66..64adb90953b 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1259,7 +1259,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) uint32_t resv_id, spank_job_env_size = 0, qos_id, derived_ec = 0; uint32_t array_job_id = 0, req_switch = 0, wait4switch = 0; uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET; - uint32_t job_state; + uint32_t job_state, local_job_id = 0; time_t start_time, end_time, suspend_time, pre_sus_time, tot_sus_time; time_t preempt_time = 0, deadline = 0; time_t resize_time = 0, now = time(NULL); @@ -1684,17 +1684,6 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) goto unpack_error; } - if (job_id > MAX_JOB_ID) { - error("JobID %u can not be recovered, JobID too high", - job_id); - job_ptr->job_state = JOB_FAILED; - job_ptr->exit_code = 1; - job_ptr->state_reason = FAIL_SYSTEM; - xfree(job_ptr->state_desc); - job_ptr->end_time = now; - goto unpack_error; - } - if (((job_state & JOB_STATE_BASE) >= JOB_END) || (batch_flag > MAX_BATCH_REQUEUE)) { error("Invalid data for job %u: " @@ -1712,8 +1701,11 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) highest_prio = MAX(highest_prio, priority); lowest_prio = MIN(lowest_prio, priority); } - if (job_id_sequence <= job_id) - job_id_sequence = job_id + 1; + + /* base job_id_sequence on local job id */ + local_job_id = fed_mgr_get_local_id(job_id); + if (job_id_sequence <= local_job_id) + job_id_sequence = local_job_id + 1; xfree(job_ptr->tres_alloc_str); job_ptr->tres_alloc_str = tres_alloc_str; -- GitLab