From c8c738c5d6b5dcab08e61d0c85af1507b95e3868 Mon Sep 17 00:00:00 2001
From: Brian Christiansen <brian@schedmd.com>
Date: Tue, 19 Jul 2016 16:38:51 -0600
Subject: [PATCH] Allow upgrading jobs to run even if jobid>MaxJobID

New jobs will get job IDs within the new range. Even though
pre-existing jobs with large job IDs will have federated job IDs, they
will be treated as local jobs.
---
 NEWS                    |  8 ++++----
 RELEASE_NOTES           |  7 ++++---
 src/slurmctld/job_mgr.c | 20 ++++++--------------
 3 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/NEWS b/NEWS
index 674c5954357..de179e26d4f 100644
--- a/NEWS
+++ b/NEWS
@@ -4,10 +4,10 @@ documents those changes that are of interest to users and administrators.
 * Federation Changes - Put below when merged in.
 ==============================
  -- In order to support federated jobs, the MaxJobID configuration parameter
-    default value has been reduced from 2,147,418,112 to 67,043,328 and it's
-    maximum value is now 67,108,863. ANY JOBS WITH A JOB ID ABOVE 67,108,863
-    WILL BE PURGED WHEN SLURM IS UPGRADED FROM AN OLDER VERSION!
-
+    default value has been reduced from 2,147,418,112 to 67,043,328 and its
+    maximum value is now 67,108,863. Upon upgrading, any pre-existing jobs that
+    have a job ID above the new range will continue to run and new jobs will get
+    job IDs in the new range.
 
 * Changes in Slurm 17.02.0pre1
 ==============================
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index c15ad70f39c..1916f60dc6e 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -33,9 +33,10 @@ upgrading Slurm to a new major release.
 HIGHLIGHTS
 ==========
  -- In order to support federated jobs, the MaxJobID configuration parameter
-    default value has been reduced from 2,147,418,112 to 67,043,328 and it's
-    maximum value is now 67,108,863. ANY JOBS WITH A JOB ID ABOVE 67,108,863
-    WILL BE PURGED WHEN SLURM IS UPGRADED FROM AN OLDER VERSION!
+    default value has been reduced from 2,147,418,112 to 67,043,328 and its
+    maximum value is now 67,108,863. Upon upgrading, any pre-existing jobs that
+    have a job ID above the new range will continue to run and new jobs will get
+    job IDs in the new range.
 
 RPMBUILD CHANGES
 ================
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index f757276ce66..64adb90953b 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1259,7 +1259,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version)
 	uint32_t resv_id, spank_job_env_size = 0, qos_id, derived_ec = 0;
 	uint32_t array_job_id = 0, req_switch = 0, wait4switch = 0;
 	uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET;
-	uint32_t job_state;
+	uint32_t job_state, local_job_id = 0;
 	time_t start_time, end_time, suspend_time, pre_sus_time, tot_sus_time;
 	time_t preempt_time = 0, deadline = 0;
 	time_t resize_time = 0, now = time(NULL);
@@ -1684,17 +1684,6 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version)
 		goto unpack_error;
 	}
 
-	if (job_id > MAX_JOB_ID) {
-		error("JobID %u can not be recovered, JobID too high",
-		      job_id);
-		job_ptr->job_state = JOB_FAILED;
-		job_ptr->exit_code = 1;
-		job_ptr->state_reason = FAIL_SYSTEM;
-		xfree(job_ptr->state_desc);
-		job_ptr->end_time = now;
-		goto unpack_error;
-	}
-
 	if (((job_state & JOB_STATE_BASE) >= JOB_END) ||
 	    (batch_flag > MAX_BATCH_REQUEUE)) {
 		error("Invalid data for job %u: "
@@ -1712,8 +1701,11 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version)
 		highest_prio = MAX(highest_prio, priority);
 		lowest_prio  = MIN(lowest_prio,  priority);
 	}
-	if (job_id_sequence <= job_id)
-		job_id_sequence = job_id + 1;
+
+	/* base job_id_sequence on local job id */
+	local_job_id = fed_mgr_get_local_id(job_id);
+	if (job_id_sequence <= local_job_id)
+		job_id_sequence = local_job_id + 1;
 
 	xfree(job_ptr->tres_alloc_str);
 	job_ptr->tres_alloc_str = tres_alloc_str;
-- 
GitLab