diff --git a/NEWS b/NEWS index d0f49194fe3b40f63952a7a4c0810a58d2347c01..660dccbd1537c263257c5b4930b062d3e7bf71a7 100644 --- a/NEWS +++ b/NEWS @@ -149,6 +149,8 @@ documents those changes that are of interest to users and admins. comes last. -- When attempting to requeue a job distinguish the case in which the job is JOB_COMPLETING or already pending. + -- When reconfiguring the controller don't restart the slurmctld epilog if it + is already running. * Changes in Slurm 14.03.3-2 ============================ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index cf90d4274723517de52f2e52dc684ac2c069ca3f..80196e9cba060c6163f06c074cf1c6ba5f07eeae 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3174,7 +3174,10 @@ struct job_record *_job_rec_copy(struct job_record *job_ptr) details_new->std_in = xstrdup(job_details->std_in); details_new->std_out = xstrdup(job_details->std_out); details_new->work_dir = xstrdup(job_details->work_dir); - _copy_job_desc_files(job_ptr->job_id, job_ptr_new->job_id); + if (_copy_job_desc_files(job_ptr->job_id, job_ptr_new->job_id)) { + _list_delete_job((void *) job_ptr_new); + return NULL; + } return job_ptr_new; } @@ -5034,10 +5037,9 @@ _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id) xstrcat(dir_name, job_dir); if (mkdir(dir_name, 0700)) { if (!slurmctld_primary && (errno == EEXIST)) { - fatal("Apparent duplicate job ID %u. Two primary " - "slurmctld daemons may currently be active. " - "Shutting down this daemon to avoid inconsistent " - "state due to split brain.", job_id); + error("Apparent duplicate job ID %u. Two primary " + "slurmctld daemons might currently be active", + job_id); } error("mkdir(%s) error %m", dir_name); xfree(dir_name); @@ -5065,11 +5067,37 @@ _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id) return error_code; } +/* Return true if the specified job ID already has a batch directory so + * that a different job ID can be created. 
This is to help limit damage from + * split-brain, where two slurmctld daemons are running as primary. */ +static bool _dup_job_file_test(uint32_t job_id) +{ + char *dir_name_src, job_dir[40]; + struct stat buf; + int rc, hash; + + dir_name_src = slurm_get_state_save_location(); + hash = job_id % 10; + sprintf(job_dir, "/hash.%d", hash); + xstrcat(dir_name_src, job_dir); + sprintf(job_dir, "/job.%u", job_id); + xstrcat(dir_name_src, job_dir); + rc = stat(dir_name_src, &buf); + xfree(dir_name_src); + if (rc == 0) { + error("Vestigial state files for job %u, but no job record. " + "this may be the result of two slurmctld running in " + "primary mode", job_id); + return true; + } + return false; +} + /* _copy_job_desc_files - create copies of a job script and environment files */ static int _copy_job_desc_files(uint32_t job_id_src, uint32_t job_id_dest) { - int error_code = 0, hash; + int error_code = SLURM_SUCCESS, hash; char *dir_name_src, *dir_name_dest, job_dir[40]; char *file_name_src, *file_name_dest; @@ -5090,10 +5118,9 @@ _copy_job_desc_files(uint32_t job_id_src, uint32_t job_id_dest) xstrcat(dir_name_dest, job_dir); if (mkdir(dir_name_dest, 0700)) { if (!slurmctld_primary && (errno == EEXIST)) { - fatal("Apparent duplicate job ID %u. Two primary " - "slurmctld daemons may currently be active. " - "Shutting down this daemon to avoid inconsistent " - "state due to split brain.", job_id_dest); + error("Apparent duplicate job ID %u. 
Two primary " + "slurmctld daemons might currently be active", + job_id_dest); } error("mkdir(%s) error %m", dir_name_dest); xfree(dir_name_src); @@ -7464,10 +7491,12 @@ static int _set_job_id(struct job_record *job_ptr) if (++job_id_sequence >= slurmctld_conf.max_job_id) job_id_sequence = slurmctld_conf.first_job_id; new_id = job_id_sequence; - if (find_job_record(new_id) == NULL) { - job_ptr->job_id = new_id; - return SLURM_SUCCESS; - } + if (find_job_record(new_id)) + continue; + if (_dup_job_file_test(new_id)) + continue; + job_ptr->job_id = new_id; + return SLURM_SUCCESS; } error("We have exhausted our supply of valid job id values. " "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id, diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index bca68f8ad804202acc607bd545d0342d09e15c35..718b93f91a7f4613e698f07d3ced6cd437064078 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -1742,6 +1742,17 @@ static int _sync_nodes_to_comp_job(void) job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { if ((job_ptr->node_bitmap) && IS_JOB_COMPLETING(job_ptr)) { + + /* If the controller is reconfiguring + * and the job is in completing state + * and the slurmctld epilog is already + * running which means deallocate_nodes() + * was already called, do not invoke it again + * and don't start another epilog. 
+ */ + if (job_ptr->epilog_running == true) + continue; + update_cnt++; /* This needs to be set up for the priority plugin and this happens before it is @@ -1750,7 +1761,8 @@ static int _sync_nodes_to_comp_job(void) if (!cluster_cpus) set_cluster_cpus(); - info("Job %u in completing state", job_ptr->job_id); + info("%s: Job %u in completing state", + __func__, job_ptr->job_id); if (!job_ptr->node_bitmap_cg) build_cg_bitmap(job_ptr); deallocate_nodes(job_ptr, false, false, false); @@ -1761,7 +1773,7 @@ static int _sync_nodes_to_comp_job(void) } list_iterator_destroy(job_iterator); if (update_cnt) - info("_sync_nodes_to_comp_job completing %d jobs", update_cnt); + info("%s: completing %d jobs", __func__, update_cnt); return update_cnt; }