diff --git a/NEWS b/NEWS index a7ccff28d3ad9c0adf26dd8777567716df1b1b65..d382ddaab91bfcb89a983175c70b266536a9f77e 100644 --- a/NEWS +++ b/NEWS @@ -134,6 +134,9 @@ documents those changes that are of interest to users and admins. -- sched/backfill - If job started with infinite time limit, set its end_time one year in the future. -- Clear record of a job's gres when requeued. + -- Clear QOS used CPU run seconds (grp_used_cpu_run_secs) when resetting raw usage if QOS is not using any cpus. + -- Remove log message left over from debugging. + -- When using CR_PACK_NODES, make --ntasks-per-node work correctly. * Changes in Slurm 14.03.4 ========================== diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index 3badd65fb1ab49e733eec26787c79c54b1447809..a18f8429df720be0e2218ed96a2c0d8c2f6e660a 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -3779,6 +3779,8 @@ extern void assoc_mgr_remove_qos_usage(slurmdb_qos_rec_t *qos) qos->usage->usage_raw = 0; qos->usage->grp_used_wall = 0; + if (!qos->usage->grp_used_cpus) + qos->usage->grp_used_cpu_run_secs = 0; } extern int dump_assoc_mgr_state(char *state_save_location) diff --git a/src/common/slurm_step_layout.c b/src/common/slurm_step_layout.c index d1a863b4f366110138d22f11ab73692e512c73cb..8f213e61efdc1eb2c331c670af4f3222afca72e5 100644 --- a/src/common/slurm_step_layout.c +++ b/src/common/slurm_step_layout.c @@ -437,6 +437,18 @@ static int _init_task_layout(slurm_step_layout_t *step_layout, * cpus_per_task=3) */ cpus[i] = 1; } + + if ((plane_size != (uint16_t)NO_VAL) + && (task_dist != SLURM_DIST_PLANE)) { + /* plane_size when dist != plane is used to + convey ntasks_per_node. Adjust the number + of cpus to reflect that. 
+ */ + uint16_t cpus_per_node = plane_size * cpus_per_task; + if (cpus[i] > cpus_per_node) + cpus[i] = cpus_per_node; + } + //info("got %d cpus", cpus[i]); if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) { /* move to next record */ @@ -568,7 +580,7 @@ static int _task_layout_block(slurm_step_layout_t *step_layout, uint16_t *cpus) } } - /* Pass 3: Spread remainign tasks across all nodes */ + /* Pass 3: Spread remaining tasks across all nodes */ while (task_id < step_layout->task_cnt) { for (i = 0; ((i < step_layout->node_cnt) && (task_id < step_layout->task_cnt)); i++) { diff --git a/src/srun/libsrun/launch.c b/src/srun/libsrun/launch.c index 45005fc3c173589ffe76e65ada3e2c20d49fa26a..e94f3e56806ec592bbc8be8fa4542aafa397faba 100644 --- a/src/srun/libsrun/launch.c +++ b/src/srun/libsrun/launch.c @@ -243,6 +243,8 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, case SLURM_DIST_CYCLIC_CFULL: case SLURM_DIST_BLOCK_CFULL: job->ctx_params.task_dist = opt.distribution; + if (opt.ntasks_per_node != NO_VAL) + job->ctx_params.plane_size = opt.ntasks_per_node; break; case SLURM_DIST_PLANE: job->ctx_params.task_dist = SLURM_DIST_PLANE; @@ -252,6 +254,8 @@ extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, job->ctx_params.task_dist = (job->ctx_params.task_count <= job->ctx_params.min_nodes) ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK; + if (opt.ntasks_per_node != NO_VAL) + job->ctx_params.plane_size = opt.ntasks_per_node; opt.distribution = job->ctx_params.task_dist; break;