diff --git a/src/slurmctld/backup.c b/src/slurmctld/backup.c index dab44f4e655a3316fd9d9a684826a4b0bb29864c..1d0a84c7187870e5fb9e285190e38b1a6f9d9a9c 100644 --- a/src/slurmctld/backup.c +++ b/src/slurmctld/backup.c @@ -126,9 +126,12 @@ void run_backup(void) if (slurmctld_config.shutdown_time != 0) { info("BackupController terminating"); pthread_join(slurmctld_config.thread_id_sig, NULL); + /* Since pidfile is created as user root (its owner is + * changed to SlurmUser) SlurmUser may not be able to + * remove it, so this is not necessarily an error. */ if (unlink(slurmctld_conf.slurmctld_pidfile) < 0) - error("Unable to remove pidfile '%s': %m", - slurmctld_conf.slurmctld_pidfile); + verbose("Unable to remove pidfile '%s': %m", + slurmctld_conf.slurmctld_pidfile); log_fini(); exit(0); } diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index e8267c6567ee2c4e44ac80e942ff8f4214281552..cff832d9f41f71fe26b313739e0863ba90261060 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -232,13 +232,13 @@ int main(int argc, char *argv[]) SLURM_CONFIG_FILE); abort(); } - info("Running primary controller"); } else { error("this host (%s) not valid controller (%s or %s)", node_name, slurmctld_conf.control_machine, slurmctld_conf.backup_controller); exit(0); } + info("Running as primary controller"); if (switch_state_begin(recover)) { error("switch_state_begin: %m"); @@ -289,9 +289,12 @@ int main(int argc, char *argv[]) break; } + /* Since pidfile is created as user root (its owner is + * changed to SlurmUser) SlurmUser may not be able to + * remove it, so this is not necessarily an error. */ if (unlink(slurmctld_conf.slurmctld_pidfile) < 0) - error("Unable to remove pidfile '%s': %m", - slurmctld_conf.slurmctld_pidfile); + verbose("Unable to remove pidfile '%s': %m", + slurmctld_conf.slurmctld_pidfile); #if MEM_LEAK_TEST /* This should purge all allocated memory, *\ @@ -305,6 +308,8 @@ int main(int argc, char *argv[]) free_slurm_conf(&slurmctld_conf); slurm_api_clear_config(); #endif + + info("Slurmctld shutdown completing"); log_fini(); if (dump_core) @@ -639,9 +644,10 @@ static void *_slurmctld_background(void *no_data) if (slurmctld_config.server_thread_count) info("shutdown server_thread_count=%d", slurmctld_config.server_thread_count); - if (_report_locks_set() == 0) + if (_report_locks_set() == 0) { + info("Saving all slurm state"); save_all_state(); - else + } else error("can not save state, semaphores set"); break; } @@ -903,7 +909,7 @@ static int _shutdown_backup_controller(void) if ((slurmctld_conf.backup_addr == NULL) || (strlen(slurmctld_conf.backup_addr) == 0)) { debug("No backup controller to shutdown"); - return SLURM_PROTOCOL_SUCCESS; + return SLURM_SUCCESS; } slurm_set_addr(&req.address, slurmctld_conf.slurmctld_port, @@ -931,7 +937,7 @@ static int _shutdown_backup_controller(void) * here and give the backup controller time to shutdown */ sleep(2); - return SLURM_PROTOCOL_SUCCESS; + return SLURM_SUCCESS; } /* Reset the job credential key based upon configuration parameters */ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 0a2ffcf52fbb7461571161bc56a5306e526b7c4b..4127d665313fa5df486e2741526a7697ede2baa5 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -318,7 +318,7 @@ int load_all_job_state(void) { int data_allocated, data_read = 0, error_code = 0; uint32_t data_size = 0; - int state_fd; + int state_fd, job_cnt = 0; char *data = NULL, *state_file; Buf buffer; time_t buf_time; @@ -337,17 +337,21 @@ int load_all_job_state(void) while (1) { data_read = read(state_fd, &data[data_size], HUGE_BUF_SIZE); - if ((data_read == -1) && (errno == EINTR)) - continue; - if (data_read == 0) /* eof */ + if (data_read < 0) { + if (errno == EINTR) + continue; + else { + error("Read error on %s: %m", + state_file); + break; + } + } else if (data_read == 0) /* eof */ break; data_size += data_read; data_allocated += data_read; xrealloc(data, data_allocated); } close(state_fd); - if (data_read < 0) - error("Error reading file %s: %m", state_file); } xfree(state_file); unlock_state_files(); @@ -362,14 +366,16 @@ int load_all_job_state(void) error_code = _load_job_state(buffer); if (error_code != SLURM_SUCCESS) goto unpack_error; + job_cnt++; } free_buf(buffer); + info("Recovered state of %d jobs", job_cnt); return error_code; unpack_error: error("Incomplete job data checkpoint file"); - error("Job state not completely restored"); + info("State of %d jobs recovered", job_cnt); free_buf(buffer); return SLURM_FAILURE; } @@ -554,6 +560,7 @@ static int _load_job_state(Buf buffer) return SLURM_SUCCESS; unpack_error: + error("Incomplete job data checkpoint file."); xfree(host); xfree(nodes); xfree(partition); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 6ed4fae429a2761f33247bfdce7d2f182e77ccbf..4e5dad52440baed824d7e760e672c5ce375a5caa 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -397,7 +397,7 @@ _dump_node_state (struct node_record *dump_node_ptr, Buf buffer) extern int load_all_node_state ( bool state_only ) { char *node_name, *reason = NULL, *data = NULL, *state_file; - int data_allocated, data_read = 0, error_code = 0; + int data_allocated, data_read = 0, error_code = 0, node_cnt = 0; uint16_t node_state, name_len; uint32_t cpus, real_memory, tmp_disk, data_size = 0; struct node_record *node_ptr; @@ -419,17 +419,21 @@ extern int load_all_node_state ( bool state_only ) data = xmalloc(data_allocated); while (1) { data_read = read (state_fd, &data[data_size], BUF_SIZE); - if ((data_read == -1) && (errno == EINTR)) - continue; - if (data_read == 0) /* eof */ + if (data_read < 0) { + if (errno == EINTR) + continue; + else { + error ("Read error on %s: %m", + state_file); + break; + } + } else if (data_read == 0) /* eof */ break; data_size += data_read; data_allocated += data_read; xrealloc(data, data_allocated); } close (state_fd); - if (data_read < 0) - error ("Read error on %s, %m", state_file); } xfree (state_file); unlock_state_files (); @@ -465,6 +469,7 @@ extern int load_all_node_state ( bool state_only ) node_name); xfree(reason); } else if (state_only) { + node_cnt++; if ((node_ptr->node_state == NODE_STATE_UNKNOWN) && ((node_state == NODE_STATE_DOWN) || (node_state == NODE_STATE_DRAINED) || @@ -475,6 +480,7 @@ extern int load_all_node_state ( bool state_only ) else xfree(reason); } else { + node_cnt++; node_ptr->node_state = node_state; xfree(node_ptr->reason); node_ptr->reason = reason; @@ -486,11 +492,13 @@ extern int load_all_node_state ( bool state_only ) xfree (node_name); } + info ("Recovered state of %d nodes", node_cnt); free_buf (buffer); return error_code; unpack_error: - error ("Incomplete node data checkpoint file. Incomplete restore."); + error ("Incomplete node data checkpoint file"); + info("Recovered state of %d nodes", node_cnt); free_buf (buffer); return EFAULT; } diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c index e469828bb30d5ddfdf4d435dd2f16b7b2602304f..0d3f71af294162f15d8a650a7a9780b6c165613a 100644 --- a/src/slurmctld/partition_mgr.c +++ b/src/slurmctld/partition_mgr.c @@ -346,7 +346,7 @@ int load_all_part_state(void) uint16_t name_len, def_part_flag, root_only, shared, state_up; struct part_record *part_ptr; uint32_t data_size = 0; - int data_allocated, data_read = 0, error_code = 0; + int data_allocated, data_read = 0, error_code = 0, part_cnt = 0; int state_fd; Buf buffer; @@ -364,17 +364,21 @@ int load_all_part_state(void) data = xmalloc(data_allocated); while (1) { data_read = read(state_fd, &data[data_size], BUF_SIZE); - if ((data_read == -1) && (errno == EINTR)) - continue; - if (data_read == 0) /* eof */ + if (data_read < 0) { + if (errno == EINTR) + continue; + else { + error("Read error on %s: %m", + state_file); + break; + } + } else if (data_read == 0) /* eof */ break; data_size += data_read; data_allocated += data_read; xrealloc(data, data_allocated); } close(state_fd); - if (data_read < 0) - error("Error reading file %s: %m", state_file); } xfree(state_file); unlock_state_files(); @@ -414,6 +418,7 @@ int load_all_part_state(void) part_name); if (part_ptr) { + part_cnt++; part_ptr->max_time = max_time; part_ptr->max_nodes = max_nodes; part_ptr->min_nodes = min_nodes; @@ -436,12 +441,13 @@ int load_all_part_state(void) xfree(part_name); } + info("Recovered state of %d partitions", part_cnt); free_buf(buffer); return error_code; unpack_error: - error("Incomplete partition data checkpoint file. " - "State not completely restored"); + error("Incomplete partition data checkpoint file"); + info("Recovered state of %d partitions", part_cnt); free_buf(buffer); return EFAULT; }