diff --git a/NEWS b/NEWS index 674c59543575382bda1c4eebc207186e64bdbfc4..de179e26d4f58faee21718c5d11eba5e7cbdf3f2 100644 --- a/NEWS +++ b/NEWS @@ -4,10 +4,10 @@ documents those changes that are of interest to users and administrators. * Federation Changes - Put below when merged in. ============================== -- In order to support federated jobs, the MaxJobID configuration parameter - default value has been reduced from 2,147,418,112 to 67,043,328 and it's - maximum value is now 67,108,863. ANY JOBS WITH A JOB ID ABOVE 67,108,863 - WILL BE PURGED WHEN SLURM IS UPGRADED FROM AN OLDER VERSION! - + default value has been reduced from 2,147,418,112 to 67,043,328 and its + maximum value is now 67,108,863. Upon upgrading, any pre-existing jobs that + have a job ID above the new range will continue to run and new jobs will get + job IDs in the new range. * Changes in Slurm 17.02.0pre1 ============================== diff --git a/RELEASE_NOTES b/RELEASE_NOTES index c15ad70f39ca800841afec9ec7afd727f386e293..1916f60dc6e68edcfcdae9fa8d3c4a9dc6ebbcd3 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -33,9 +33,10 @@ upgrading Slurm to a new major release. HIGHLIGHTS ========== -- In order to support federated jobs, the MaxJobID configuration parameter - default value has been reduced from 2,147,418,112 to 67,043,328 and it's - maximum value is now 67,108,863. ANY JOBS WITH A JOB ID ABOVE 67,108,863 - WILL BE PURGED WHEN SLURM IS UPGRADED FROM AN OLDER VERSION! + default value has been reduced from 2,147,418,112 to 67,043,328 and its + maximum value is now 67,108,863. Upon upgrading, any pre-existing jobs that + have a job ID above the new range will continue to run and new jobs will get + job IDs in the new range. RPMBUILD CHANGES ================ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index f757276ce66131944da233143d54349c84baae46..64adb90953b60d8944d90a9fa2181eec28826edb 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1259,7 +1259,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) uint32_t resv_id, spank_job_env_size = 0, qos_id, derived_ec = 0; uint32_t array_job_id = 0, req_switch = 0, wait4switch = 0; uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET; - uint32_t job_state; + uint32_t job_state, local_job_id = 0; time_t start_time, end_time, suspend_time, pre_sus_time, tot_sus_time; time_t preempt_time = 0, deadline = 0; time_t resize_time = 0, now = time(NULL); @@ -1684,17 +1684,6 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) goto unpack_error; } - if (job_id > MAX_JOB_ID) { - error("JobID %u can not be recovered, JobID too high", - job_id); - job_ptr->job_state = JOB_FAILED; - job_ptr->exit_code = 1; - job_ptr->state_reason = FAIL_SYSTEM; - xfree(job_ptr->state_desc); - job_ptr->end_time = now; - goto unpack_error; - } - if (((job_state & JOB_STATE_BASE) >= JOB_END) || (batch_flag > MAX_BATCH_REQUEUE)) { error("Invalid data for job %u: " @@ -1712,8 +1701,11 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) highest_prio = MAX(highest_prio, priority); lowest_prio = MIN(lowest_prio, priority); } - if (job_id_sequence <= job_id) - job_id_sequence = job_id + 1; + + /* base job_id_sequence on local job id */ + local_job_id = fed_mgr_get_local_id(job_id); + if (job_id_sequence <= local_job_id) + job_id_sequence = local_job_id + 1; xfree(job_ptr->tres_alloc_str); job_ptr->tres_alloc_str = tres_alloc_str;