Commit 5f3d85ce authored by Morris Jette

Merge branch 'slurm-2.5' into slurm-2.6

parents 56076ef8 302d8b3f
@@ -207,6 +207,7 @@ documents those changes that are of interest to users and admins.
 -- Select/cons_res - Correct total CPU count allocated to a job with
    --exclusive and --cpus-per-task options
 -- switch/nrt - Don't allocate network resources unless job step has 2+ nodes.
+-- select/cons_res - Avoid extraneous "oversubscribe" error messages.
 * Changes in Slurm 2.5.7
 ========================
@@ -134,6 +134,8 @@ static int _compute_c_b_task_dist(struct job_record *job_ptr)
 	uint32_t n, i, tid, maxtasks, l;
 	uint16_t *avail_cpus;
 	job_resources_t *job_res = job_ptr->job_resrcs;
+	bool log_over_subscribe = true;
+
 	if (!job_res || !job_res->cpus) {
 		error("cons_res: _compute_c_b_task_dist given NULL job_ptr");
 		return SLURM_ERROR;
@@ -146,10 +148,12 @@ static int _compute_c_b_task_dist(struct job_record *job_ptr)
 	/* ncpus is already set the number of tasks if overcommit is used */
 	if (!job_ptr->details->overcommit &&
 	    (job_ptr->details->cpus_per_task > 1)) {
-		if (job_ptr->details->ntasks_per_node == 0)
+		if (job_ptr->details->ntasks_per_node == 0) {
 			maxtasks = maxtasks / job_ptr->details->cpus_per_task;
-		else
-			maxtasks = job_ptr->details->ntasks_per_node * job_res->nhosts;
+		} else {
+			maxtasks = job_ptr->details->ntasks_per_node *
+				   job_res->nhosts;
+		}
 	}
 	/* Safe guard if the user didn't specified a lower number of
@@ -161,16 +165,20 @@ static int _compute_c_b_task_dist(struct job_record *job_ptr)
 	}
 	if (job_ptr->details->cpus_per_task == 0)
 		job_ptr->details->cpus_per_task = 1;
+	if (job_ptr->details->overcommit)
+		log_over_subscribe = false;
 	for (tid = 0, i = job_ptr->details->cpus_per_task ; (tid < maxtasks);
 	     i += job_ptr->details->cpus_per_task ) {	/* cycle counter */
 		bool space_remaining = false;
-		if (over_subscribe) {
+		if (over_subscribe && log_over_subscribe) {
 			/* 'over_subscribe' is a relief valve that guards
 			 * against an infinite loop, and it *should* never
 			 * come into play because maxtasks should never be
 			 * greater than the total number of available cpus
 			 */
-			error("cons_res: _compute_c_b_task_dist oversubscribe");
+			error("cons_res: _compute_c_b_task_dist "
+			      "oversubscribe for job %u", job_ptr->job_id);
+			log_over_subscribe = false /* Log once per job */;
 		}
 		for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
 			if ((i <= avail_cpus[n]) || over_subscribe) {
@@ -200,6 +208,8 @@ static int _compute_plane_dist(struct job_record *job_ptr)
 	uint32_t n, i, p, tid, maxtasks, l;
 	uint16_t *avail_cpus, plane_size = 1;
 	job_resources_t *job_res = job_ptr->job_resrcs;
+	bool log_over_subscribe = true;
+
 	if (!job_res || !job_res->cpus) {
 		error("cons_res: _compute_plane_dist given NULL job_res");
 		return SLURM_ERROR;
@@ -220,16 +230,19 @@ static int _compute_plane_dist(struct job_record *job_ptr)
 		return SLURM_ERROR;
 	}
 	job_res->cpus = xmalloc(job_res->nhosts * sizeof(uint16_t));
+	if (job_ptr->details->overcommit)
+		log_over_subscribe = false;
 	for (tid = 0, i = 0; (tid < maxtasks); i++) {	/* cycle counter */
 		bool space_remaining = false;
-		if (over_subscribe) {
+		if (over_subscribe && log_over_subscribe) {
 			/* 'over_subscribe' is a relief valve that guards
 			 * against an infinite loop, and it *should* never
 			 * come into play because maxtasks should never be
 			 * greater than the total number of available cpus
 			 */
-			error("cons_res: _compute_plane_dist oversubscribe");
+			error("cons_res: _compute_plane_dist oversubscribe "
+			      "for job %u", job_ptr->job_id);
+			log_over_subscribe = false /* Log once per job */;
 		}
 		for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
 			for (p = 0; p < plane_size && (tid < maxtasks); p++) {
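The fix is the same in both functions: a new log_over_subscribe flag starts out true, is pre-cleared when the job was submitted with overcommit (oversubscription is then expected rather than an error), and is cleared again after the first message, so the relief-valve error is logged at most once per job instead of on every pass of the distribution loop. Below is a minimal, self-contained sketch of that log-once pattern under a toy allocator; toy_job, toy_distribute_tasks, TOY_NHOSTS, and the CPU counts are hypothetical stand-ins, not Slurm code.

/*
 * Sketch of the "log once per job" pattern applied by this commit to
 * _compute_c_b_task_dist() and _compute_plane_dist(). All identifiers
 * below are hypothetical stand-ins, not Slurm APIs.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_NHOSTS 2

struct toy_job {
	uint32_t job_id;
	bool     overcommit;             /* user asked to overcommit CPUs */
	uint32_t maxtasks;               /* tasks left to place */
	uint16_t avail_cpus[TOY_NHOSTS]; /* free CPUs per host */
};

static void toy_distribute_tasks(struct toy_job *job)
{
	bool over_subscribe = false;
	/* Expect oversubscription under overcommit, so don't log it. */
	bool log_over_subscribe = !job->overcommit;
	uint32_t tid = 0;

	while (tid < job->maxtasks) {
		bool space_remaining = false;

		if (over_subscribe && log_over_subscribe) {
			/* The relief valve guards against an infinite
			 * loop; report it once per job, not once per
			 * pass, which is the point of the patch. */
			fprintf(stderr, "oversubscribe for job %u\n",
				job->job_id);
			log_over_subscribe = false;	/* Log once per job */
		}

		for (int n = 0; (n < TOY_NHOSTS) && (tid < job->maxtasks);
		     n++) {
			if ((job->avail_cpus[n] > 0) || over_subscribe) {
				if (job->avail_cpus[n] > 0)
					job->avail_cpus[n]--;
				tid++;
				if (job->avail_cpus[n] > 0)
					space_remaining = true;
			}
		}
		if (!space_remaining)
			over_subscribe = true;	/* open the relief valve */
	}
}

int main(void)
{
	/* 8 tasks on 2 hosts with 3 CPUs each forces oversubscription. */
	struct toy_job job = {
		.job_id = 42, .overcommit = false,
		.maxtasks = 8, .avail_cpus = {3, 3},
	};
	toy_distribute_tasks(&job);
	return 0;
}

Running the sketch places 8 tasks on 2 x 3 CPUs, opens the relief valve on the fourth pass, and prints the oversubscribe line exactly once, which is the behavior the patch gives the cons_res plugin.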