diff --git a/NEWS b/NEWS index f2a5db4344634f276de1edbf6997692a828a3e52..c491753aea8545d3a8cb9f8a9527ddb1719f1ada 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,14 @@ documents those changes that are of interest to users and admins. -- For Moab, sbatch --get-user-env option processed by slurmd daemon rather than the sbatch command itself to permit faster response for Moab. + -- We are not saving a pending job's task distribution, so after restarting + slurmctld select/cons_res was over-allocating resources based upon an + uninitialized distribution value. Since we can't save the value without + changing the state save file format, we'll just set it to the default + value for now. This will result in an incorrect task distribution for + jobs that had a task distribution that was not the default and were + pending when the slurmctld daemon restarted, but at least resources + will not be over-allocated. * Changes in SLURM 1.3.3 ======================== diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 50d9f7a00a6bddf7eb830bcd65d59c88dc1342c3..2afd1034e1e36a90e445a629df9fc789671ef56c 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -2640,11 +2640,13 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap, error_code = cr_plane_dist(job, mc_ptr->plane_size, cr_type); break; case SLURM_DIST_ARBITRARY: - default: error_code = compute_c_b_task_dist(job); - if (error_code != SLURM_SUCCESS) { - error(" Error in compute_c_b_task_dist"); - } + if (error_code != SLURM_SUCCESS) + error("Error in compute_c_b_task_dist"); + break; + default: + error("select/cons_res: invalid dist_type"); + error_code = SLURM_ERROR; break; } } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 135484cb5d1cd16d7d7d2d93f272c97475fc1f60..b42d529967bd7349292b78563f23339d54f41efd 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -893,6 +893,8 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer) job_ptr->details->acctg_freq = acctg_freq; job_ptr->details->contiguous = contiguous; job_ptr->details->cpus_per_task = cpus_per_task; + /* FIXME: Need to save/restore actual task_dist value */ + job_ptr->details->task_dist = SLURM_DIST_CYCLIC; job_ptr->details->ntasks_per_node = ntasks_per_node; job_ptr->details->job_min_procs = job_min_procs; job_ptr->details->job_min_memory = job_min_memory;