diff --git a/RELEASE_NOTES b/RELEASE_NOTES index c25cfb9fbd820b4a44f0d014b1d7a9cb59600d15..a3139cb2b507a89783bd8fa59656f7f1b095f606 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -29,6 +29,9 @@ of Slurm will result in loss of state information. If using SPANK plugins that use the Slurm APIs, they should be recompiled when upgrading Slurm to a new major release. +NOTE: The slurmctld now exits with a fatal error if it finds a problem with + any state file. To ignore such errors and start anyway, use the new '-i' flag. + NOTE: systemd services files are installed automatically, but not enabled. You will need to manually enable them on the appropriate systems: - Controller: systemctl enable slurmctld @@ -48,6 +51,8 @@ NOTE: If you interact with any memory values in a job_submit plugin, you will need to test against NO_VAL64 instead of NO_VAL, and change your printf format as well. + + HIGHLIGHTS ========== -- Add the configure option --with-shared-libslurm which will link to diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index c7032677ea302763074e91e944e8cf573f00666a..ffbc81e92a66dd2cb88cbbce5b2386d97b639a8b 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -5366,7 +5366,9 @@ extern int load_assoc_usage(char *state_save_location) state_fd = open(state_file, O_RDONLY); if (state_fd < 0) { debug2("No Assoc usage file (%s) to recover", state_file); - goto unpack_error; + xfree(state_file); + assoc_mgr_unlock(&locks); + return ENOENT; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -5396,9 +5398,13 @@ extern int load_assoc_usage(char *state_save_location) safe_unpack16(&ver, buffer); debug3("Version in assoc_usage header is %u", ver); if (ver > SLURM_PROTOCOL_VERSION || ver < SLURM_MIN_PROTOCOL_VERSION) { + if (!ignore_state_errors) + fatal("Can not recover assoc_usage state, incompatible version, got %u need >= %u <= %u, start with '-i' to ignore this", + ver, SLURM_MIN_PROTOCOL_VERSION, + SLURM_PROTOCOL_VERSION); 
error("***********************************************"); error("Can not recover assoc_usage state, " - "incompatible version, got %u need > %u <= %u", ver, + "incompatible version, got %u need >= %u <= %u", ver, SLURM_MIN_PROTOCOL_VERSION, SLURM_PROTOCOL_VERSION); error("***********************************************"); free_buf(buffer); @@ -5453,6 +5459,9 @@ extern int load_assoc_usage(char *state_save_location) return SLURM_SUCCESS; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete assoc usage state file, start with '-i' to ignore this"); + error("Incomplete assoc usage state file"); if (buffer) free_buf(buffer); xfree(tmp_str); @@ -5484,7 +5493,9 @@ extern int load_qos_usage(char *state_save_location) state_fd = open(state_file, O_RDONLY); if (state_fd < 0) { debug2("No Qos usage file (%s) to recover", state_file); - goto unpack_error; + xfree(state_file); + assoc_mgr_unlock(&locks); + return ENOENT; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -5514,6 +5525,10 @@ extern int load_qos_usage(char *state_save_location) safe_unpack16(&ver, buffer); debug3("Version in qos_usage header is %u", ver); if (ver > SLURM_PROTOCOL_VERSION || ver < SLURM_MIN_PROTOCOL_VERSION) { + if (!ignore_state_errors) + fatal("Can not recover qos_usage state, incompatible version, " + "got %u need >= %u <= %u, start with '-i' to ignore this", + ver, SLURM_MIN_PROTOCOL_VERSION, SLURM_PROTOCOL_VERSION); error("***********************************************"); error("Can not recover qos_usage state, " "incompatible version, got %u need > %u <= %u", ver, @@ -5559,6 +5574,9 @@ extern int load_qos_usage(char *state_save_location) return SLURM_SUCCESS; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete QOS usage state file, start with '-i' to ignore this"); + error("Incomplete QOS usage state file"); if (buffer) free_buf(buffer); if (itr) @@ -5591,7 +5609,9 @@ extern int load_assoc_mgr_state(char *state_save_location) state_fd = 
open(state_file, O_RDONLY); if (state_fd < 0) { debug2("No association state file (%s) to recover", state_file); - goto unpack_error; + xfree(state_file); + assoc_mgr_unlock(&locks); + return ENOENT; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -5621,6 +5641,10 @@ extern int load_assoc_mgr_state(char *state_save_location) safe_unpack16(&ver, buffer); debug3("Version in assoc_mgr_state header is %u", ver); if (ver > SLURM_PROTOCOL_VERSION || ver < SLURM_MIN_PROTOCOL_VERSION) { + if (!ignore_state_errors) + fatal("Can not recover assoc_mgr state, incompatible version, " + "got %u need >= %u <= %u, start with '-i' to ignore this", + ver, SLURM_MIN_PROTOCOL_VERSION, SLURM_PROTOCOL_VERSION); error("***********************************************"); error("Can not recover assoc_mgr state, incompatible version, " "got %u need > %u <= %u", ver, @@ -5759,6 +5783,9 @@ extern int load_assoc_mgr_state(char *state_save_location) return SLURM_SUCCESS; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete assoc mgr state file, start with '-i' to ignore this"); + error("Incomplete assoc mgr state file"); if (buffer) free_buf(buffer); assoc_mgr_unlock(&locks); diff --git a/src/plugins/burst_buffer/cray/burst_buffer_cray.c b/src/plugins/burst_buffer/cray/burst_buffer_cray.c index 9667dd70d2c82f3559f73edc6335aaa0db8d2faf..c72933a80842ddcc99d7e747965a95ee5ac17bfa 100644 --- a/src/plugins/burst_buffer/cray/burst_buffer_cray.c +++ b/src/plugins/burst_buffer/cray/burst_buffer_cray.c @@ -931,6 +931,8 @@ static void _recover_bb_state(void) buffer = create_buf(data, data_size); safe_unpack16(&protocol_version, buffer); if (protocol_version == (uint16_t)NO_VAL) { + if (!ignore_state_errors) + fatal("Can not recover burst_buffer/cray state, data version incompatible, start with '-i' to ignore this"); error("******************************************************************"); error("Can not recover burst_buffer/cray state, data version incompatible"); 
error("******************************************************************"); @@ -1002,6 +1004,8 @@ static void _recover_bb_state(void) return; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete burst buffer data checkpoint file, start with '-i' to ignore this"); error("Incomplete burst buffer data checkpoint file"); xfree(account); xfree(name); diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index a4f9d9ba42f93b757629de64d74051c540e0f710..4bfd2338242f3c09de8dcc8bb8e19e7a373e2799 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -314,6 +314,8 @@ static void _read_last_decay_ran(time_t *last_ran, time_t *last_reset) return; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete priority last decay file exiting, start with '-i' to ignore this"); error("Incomplete priority last decay file returning"); free_buf(buffer); return; diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index 0a6b95cca6e72a5a788d58e2772a71195e129da1..4c0a8a35cbd5fad4bedf2b68bd31036d3ed6ccd0 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -654,6 +654,8 @@ static int _load_state_file(List curr_block_list, char *dir_name) safe_unpack16(&protocol_version, buffer); if (protocol_version == (uint16_t)NO_VAL) { + if (!ignore_state_errors) + fatal("Can not recover block state, data version incompatible, start with '-i' to ignore this"); error("***********************************************"); error("Can not recover block state, " "data version incompatible"); @@ -809,9 +811,11 @@ static int _load_state_file(List curr_block_list, char *dir_name) return SLURM_SUCCESS; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete block data checkpoint file, start with '-i' to ignore this"); + 
error("Incomplete block data checkpoint file"); FREE_NULL_BITMAP(usable_mp_bitmap); slurm_mutex_unlock(&block_state_mutex); - error("Incomplete block data checkpoint file"); free_buf(buffer); return SLURM_FAILURE; diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index 5bb2b89658da7a6b2f0dc2207a9564d766da7ac5..c92114c78353bf4f4a292ef4c620033ffbaf8a9c 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -1469,6 +1469,8 @@ extern int select_p_state_restore(char *dir_name) debug3("Version in blade_state header is %u", protocol_version); if (protocol_version == (uint16_t)NO_VAL) { + if (!ignore_state_errors) + fatal("Can not recover blade state, data version incompatible, start with '-i' to ignore this"); error("***********************************************"); error("Can not recover blade state, " "data version incompatible"); @@ -1562,6 +1564,8 @@ extern int select_p_state_restore(char *dir_name) unpack_error: slurm_mutex_unlock(&blade_mutex); + if (!ignore_state_errors) + fatal("Incomplete blade data checkpoint file, you may get unexpected issues if jobs were running. 
Start with '-i' to ignore this"); error("Incomplete blade data checkpoint file, you may get " "unexpected issues if jobs were running."); free_buf(buffer); diff --git a/src/plugins/slurmctld/nonstop/do_work.c b/src/plugins/slurmctld/nonstop/do_work.c index fac3e9c6be60e09a1485a578c63235f7d04bc38c..6d6afe49269b8b9372ed03fbb7fff1274ca777a7 100644 --- a/src/plugins/slurmctld/nonstop/do_work.c +++ b/src/plugins/slurmctld/nonstop/do_work.c @@ -419,6 +419,8 @@ extern int restore_nonstop_state(void) debug3("Version in slurmctld/nonstop header is %u", protocol_version); if (protocol_version == (uint16_t) NO_VAL) { + if (!ignore_state_errors) + fatal("Can not recover slurmctld/nonstop state, incompatible version, start with '-i' to ignore this"); error("*************************************************************"); error("Can not recover slurmctld/nonstop state, incompatible version"); error("*************************************************************"); @@ -445,6 +447,8 @@ extern int restore_nonstop_state(void) return error_code; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete nonstop state file, start with '-i' to ignore this"); error("Incomplete nonstop state file"); free_buf(buffer); return SLURM_FAILURE; diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 23dba5b52a0aa873e8c5e3cc1c5992b0478d3885..e274f02db5a1c830301a83c133950ef1b0ffbcfd 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -2086,9 +2086,7 @@ extern void ctld_assoc_mgr_init(slurm_trigger_callbacks_t *callbacks) } /* Now load the usage from a flat file since it isn't kept in - the database No need to check for an error since if this - fails we will get an error message and we will go on our - way. If we get an error we can't do anything about it. 
+ the database */ load_assoc_usage(slurmctld_conf.state_save_location); load_qos_usage(slurmctld_conf.state_save_location); diff --git a/src/slurmctld/fed_mgr.c b/src/slurmctld/fed_mgr.c index e458d357783245ab2f6fa5dd4a3b1d3211c96561..fe85343d3e615b04791906572fc20d55fbaa24f7 100644 --- a/src/slurmctld/fed_mgr.c +++ b/src/slurmctld/fed_mgr.c @@ -2481,6 +2481,10 @@ static slurmdb_federation_rec_t *_state_load(char *state_save_location, debug3("Version in fed_mgr_state header is %u", ver); if (ver > SLURM_PROTOCOL_VERSION || ver < SLURM_MIN_PROTOCOL_VERSION) { + if (!ignore_state_errors) + fatal("Can not recover fed_mgr state, incompatible version, got %u need > %u <= %u, start with '-i' to ignore this", + ver, SLURM_MIN_PROTOCOL_VERSION, + SLURM_PROTOCOL_VERSION); error("***********************************************"); error("Can not recover fed_mgr state, incompatible version, " "got %u need > %u <= %u", ver, @@ -2540,6 +2544,9 @@ static slurmdb_federation_rec_t *_state_load(char *state_save_location, return ret_fed; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete fed_mgr state file, start with '-i' to ignore this"); + error("Incomplete fed_mgr state file"); free_buf(buffer); return NULL; diff --git a/src/slurmctld/front_end.c b/src/slurmctld/front_end.c index c0587f7eca833c75e1c269c12f8784f470ea1aa5..3a464d473236ca434597f1ef63657385de379e48 100644 --- a/src/slurmctld/front_end.c +++ b/src/slurmctld/front_end.c @@ -789,8 +789,10 @@ extern int load_all_front_end_state(bool state_only) lock_state_files (); state_fd = _open_front_end_state_file(&state_file); if (state_fd < 0) { - info ("No node state file (%s) to recover", state_file); - error_code = ENOENT; + info("No node state file (%s) to recover", state_file); + xfree(state_file); + unlock_state_files(); + return ENOENT; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -823,6 +825,8 @@ extern int load_all_front_end_state(bool state_only) safe_unpack16(&protocol_version, 
buffer); if (protocol_version == (uint16_t) NO_VAL) { + if (!ignore_state_errors) + fatal("Can not recover front_end state, version incompatible, start with '-i' to ignore this"); error("*****************************************************"); error("Can not recover front_end state, version incompatible"); error("*****************************************************"); @@ -918,6 +922,8 @@ fini: info("Recovered state of %d front_end nodes", node_cnt); return error_code; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete front_end node data checkpoint file, start with '-i' to ignore this"); error("Incomplete front_end node data checkpoint file"); error_code = EFAULT; xfree (node_name); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 187040db2f38504b2aaedfcbce25b83a3372c08d..db271ce876483088855bd3c41f92a85926e45767 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -935,7 +935,9 @@ extern int load_all_job_state(void) state_fd = _open_job_state_file(&state_file); if (state_fd < 0) { info("No job state file (%s) to recover", state_file); - error_code = ENOENT; + xfree(state_file); + unlock_state_files(); + return ENOENT; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -973,6 +975,8 @@ extern int load_all_job_state(void) xfree(ver_str); if (protocol_version == (uint16_t)NO_VAL) { + if (!ignore_state_errors) + fatal("Can not recover job state, incompatible version, start with '-i' to ignore this"); error("***********************************************"); error("Can not recover job state, incompatible version"); error("***********************************************"); @@ -1002,6 +1006,8 @@ extern int load_all_job_state(void) unpack_error: assoc_mgr_unlock(&locks); + if (!ignore_state_errors) + fatal("Incomplete job state save file, start with '-i' to ignore this"); error("Incomplete job state save file"); info("Recovered information about %d jobs", job_cnt); free_buf(buffer); @@ -1032,7 +1038,9 @@ 
extern int load_last_job_id( void ) state_fd = open(state_file, O_RDONLY); if (state_fd < 0) { debug("No job state file (%s) to recover", state_file); - error_code = ENOENT; + xfree(state_file); + unlock_state_files(); + return ENOENT; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -1069,6 +1077,8 @@ extern int load_last_job_id( void ) xfree(ver_str); if (protocol_version == (uint16_t)NO_VAL) { + if (!ignore_state_errors) + fatal("Can not recover last job ID, incompatible version, start with '-i' to ignore this"); debug("*************************************************"); debug("Can not recover last job ID, incompatible version"); debug("*************************************************"); @@ -1087,7 +1097,9 @@ extern int load_last_job_id( void ) return error_code; unpack_error: - debug("Invalid job data checkpoint file"); + if (!ignore_state_errors) + fatal("Invalid job data checkpoint file, start with '-i' to ignore this"); + error("Invalid job data checkpoint file"); xfree(ver_str); free_buf(buffer); return SLURM_FAILURE; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 7046473d390672657314d022c38a10770d164866..a329d5d88fac324c21b97fae8356822365a57de3 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -310,8 +310,10 @@ extern int load_all_node_state ( bool state_only ) lock_state_files (); state_fd = _open_node_state_file(&state_file); if (state_fd < 0) { - info ("No node state file (%s) to recover", state_file); - error_code = ENOENT; + info("No node state file (%s) to recover", state_file); + xfree(state_file); + unlock_state_files(); + return ENOENT; } else { data_allocated = BUF_SIZE; @@ -345,6 +347,8 @@ extern int load_all_node_state ( bool state_only ) safe_unpack16(&protocol_version, buffer); if (!protocol_version || (protocol_version == (uint16_t)NO_VAL)) { + if (!ignore_state_errors) + fatal("Can not recover node state, data version incompatible, start with '-i' to ignore this"); 
error("*****************************************************"); error("Can not recover node state, data version incompatible"); error("*****************************************************"); @@ -706,6 +710,8 @@ fini: info("Recovered state of %d nodes", node_cnt); return error_code; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete node data checkpoint file, start with '-i' to ignore this"); error("Incomplete node data checkpoint file"); error_code = EFAULT; xfree(features); diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c index dde0619f0558f5976f12a467b664953eb40a4433..2a9ba42e8cf4523653d3cfabdef489b6b3b983fe 100644 --- a/src/slurmctld/partition_mgr.c +++ b/src/slurmctld/partition_mgr.c @@ -602,7 +602,9 @@ int load_all_part_state(void) if (state_fd < 0) { info("No partition state file (%s) to recover", state_file); - error_code = ENOENT; + xfree(state_file); + unlock_state_files(); + return ENOENT; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -636,6 +638,8 @@ int load_all_part_state(void) safe_unpack16(&protocol_version, buffer); if (protocol_version == (uint16_t)NO_VAL) { + if (!ignore_state_errors) + fatal("Can not recover partition state, data version incompatible, start with '-i' to ignore this"); error("**********************************************************"); error("Can not recover partition state, data version incompatible"); error("**********************************************************"); @@ -855,7 +859,9 @@ int load_all_part_state(void) free_buf(buffer); return error_code; - unpack_error: +unpack_error: + if (!ignore_state_errors) + fatal("Incomplete partition data checkpoint file, start with '-i' to ignore this"); error("Incomplete partition data checkpoint file"); info("Recovered state of %d partitions", part_cnt); free_buf(buffer); diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 
402e5650c3738adc349beb8d441f64b39d8ea983..ce0279f3c9ae4a807b9b51a788b37a67ce449420 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -2485,6 +2485,8 @@ extern int load_config_state_lite(void) state_fd = open(state_file, O_RDONLY); if (state_fd < 0) { debug2("No last_config_lite file (%s) to recover", state_file); + xfree(state_file); + return ENOENT; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -2514,6 +2516,10 @@ extern int load_config_state_lite(void) safe_unpack16(&ver, buffer); debug3("Version in last_conf_lite header is %u", ver); if (ver > SLURM_PROTOCOL_VERSION || ver < SLURM_MIN_PROTOCOL_VERSION) { + if (!ignore_state_errors) + fatal("Can not recover last_conf_lite, incompatible version, (%u not between %d and %d), start with '-i' to ignore this", + ver, SLURM_MIN_PROTOCOL_VERSION, + SLURM_PROTOCOL_VERSION); error("***********************************************"); error("Can not recover last_conf_lite, incompatible version, " "(%u not between %d and %d)", @@ -2538,6 +2544,9 @@ extern int load_config_state_lite(void) return SLURM_SUCCESS; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete last_config_lite checkpoint file, start with '-i' to ignore this"); + error("Incomplete last_config_lite checkpoint file"); if (buffer) free_buf(buffer); diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 6dda65fe26c975d09070994db77f0064992945c3..ad4f10d1ba9894d85d7e19a0bedbf2ffcbcf7f0b 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -3547,7 +3547,9 @@ extern int load_all_resv_state(int recover) if (state_fd < 0) { info("No reservation state file (%s) to recover", state_file); - error_code = ENOENT; + xfree(state_file); + unlock_state_files(); + return ENOENT; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -3581,6 +3583,8 @@ extern int load_all_resv_state(int recover) safe_unpack16(&protocol_version, buffer); if (protocol_version 
== (uint16_t) NO_VAL) { + if (!ignore_state_errors) + fatal("Can not recover reservation state, data version incompatible, start with '-i' to ignore this"); error("************************************************************"); error("Can not recover reservation state, data version incompatible"); error("************************************************************"); @@ -3607,10 +3611,11 @@ extern int load_all_resv_state(int recover) free_buf(buffer); return error_code; - unpack_error: +unpack_error: + if (!ignore_state_errors) + fatal("Incomplete reservation data checkpoint file, start with '-i' to ignore this"); + error("Incomplete reservation data checkpoint file"); _validate_all_reservations(); - if (state_fd >= 0) - error("Incomplete reservation data checkpoint file"); info("Recovered state of %d reservations", list_count(resv_list)); if (resv_ptr) _del_resv_rec(resv_ptr); diff --git a/src/slurmctld/trigger_mgr.c b/src/slurmctld/trigger_mgr.c index 151d2e66610b4a62308fe9a22e7b18eec5d0c46e..881fd75f4915cca86f1c03800536466b0c2e6005 100644 --- a/src/slurmctld/trigger_mgr.c +++ b/src/slurmctld/trigger_mgr.c @@ -930,6 +930,9 @@ extern void trigger_state_restore(void) state_fd = _open_resv_state_file(&state_file); if (state_fd < 0) { info("No trigger state file (%s) to recover", state_file); + xfree(state_file); + unlock_state_files(); + return; } else { data_allocated = BUF_SIZE; data = xmalloc(data_allocated); @@ -961,6 +964,8 @@ extern void trigger_state_restore(void) safe_unpack16(&protocol_version, buffer); if (protocol_version == (uint16_t) NO_VAL) { + if (!ignore_state_errors) + fatal("Can't recover trigger state, data version incompatible, start with '-i' to ignore this"); error("Can't recover trigger state, data version " "incompatible"); xfree(ver_str); @@ -981,6 +986,8 @@ extern void trigger_state_restore(void) goto fini; unpack_error: + if (!ignore_state_errors) + fatal("Incomplete trigger data checkpoint file, start with '-i' to ignore this"); 
error("Incomplete trigger data checkpoint file"); fini: verbose("State of %d triggers recovered", trigger_cnt); free_buf(buffer);