Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Slurm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tud-zih-energy
Slurm
Commits
5f3d85ce
Commit
5f3d85ce
authored
11 years ago
by
Morris Jette
Browse files
Options
Downloads
Plain Diff
Merge branch 'slurm-2.5' into slurm-2.6
parents
56076ef8
302d8b3f
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
NEWS
+1
-0
1 addition, 0 deletions
NEWS
src/plugins/select/cons_res/dist_tasks.c
+21
-8
21 additions, 8 deletions
src/plugins/select/cons_res/dist_tasks.c
with
22 additions
and
8 deletions
NEWS
+
1
−
0
View file @
5f3d85ce
...
@@ -207,6 +207,7 @@ documents those changes that are of interest to users and admins.
...
@@ -207,6 +207,7 @@ documents those changes that are of interest to users and admins.
-- Select/cons_res - Correct total CPU count allocated to a job with
-- Select/cons_res - Correct total CPU count allocated to a job with
--exclusive and --cpus-per-task options
--exclusive and --cpus-per-task options
-- switch/nrt - Don't allocate network resources unless job step has 2+ nodes.
-- switch/nrt - Don't allocate network resources unless job step has 2+ nodes.
-- select/cons_res - Avoid extraneous "oversubscribe" error messages.
* Changes in Slurm 2.5.7
* Changes in Slurm 2.5.7
========================
========================
...
...
This diff is collapsed.
Click to expand it.
src/plugins/select/cons_res/dist_tasks.c
+
21
−
8
View file @
5f3d85ce
...
@@ -134,6 +134,8 @@ static int _compute_c_b_task_dist(struct job_record *job_ptr)
...
@@ -134,6 +134,8 @@ static int _compute_c_b_task_dist(struct job_record *job_ptr)
uint32_t
n
,
i
,
tid
,
maxtasks
,
l
;
uint32_t
n
,
i
,
tid
,
maxtasks
,
l
;
uint16_t
*
avail_cpus
;
uint16_t
*
avail_cpus
;
job_resources_t
*
job_res
=
job_ptr
->
job_resrcs
;
job_resources_t
*
job_res
=
job_ptr
->
job_resrcs
;
bool
log_over_subscribe
=
true
;
if
(
!
job_res
||
!
job_res
->
cpus
)
{
if
(
!
job_res
||
!
job_res
->
cpus
)
{
error
(
"cons_res: _compute_c_b_task_dist given NULL job_ptr"
);
error
(
"cons_res: _compute_c_b_task_dist given NULL job_ptr"
);
return
SLURM_ERROR
;
return
SLURM_ERROR
;
...
@@ -146,10 +148,12 @@ static int _compute_c_b_task_dist(struct job_record *job_ptr)
...
@@ -146,10 +148,12 @@ static int _compute_c_b_task_dist(struct job_record *job_ptr)
/* ncpus is already set the number of tasks if overcommit is used */
/* ncpus is already set the number of tasks if overcommit is used */
if
(
!
job_ptr
->
details
->
overcommit
&&
if
(
!
job_ptr
->
details
->
overcommit
&&
(
job_ptr
->
details
->
cpus_per_task
>
1
))
{
(
job_ptr
->
details
->
cpus_per_task
>
1
))
{
if
(
job_ptr
->
details
->
ntasks_per_node
==
0
)
if
(
job_ptr
->
details
->
ntasks_per_node
==
0
)
{
maxtasks
=
maxtasks
/
job_ptr
->
details
->
cpus_per_task
;
maxtasks
=
maxtasks
/
job_ptr
->
details
->
cpus_per_task
;
else
}
else
{
maxtasks
=
job_ptr
->
details
->
ntasks_per_node
*
job_res
->
nhosts
;
maxtasks
=
job_ptr
->
details
->
ntasks_per_node
*
job_res
->
nhosts
;
}
}
}
/* Safe guard if the user didn't specified a lower number of
/* Safe guard if the user didn't specified a lower number of
...
@@ -161,16 +165,20 @@ static int _compute_c_b_task_dist(struct job_record *job_ptr)
...
@@ -161,16 +165,20 @@ static int _compute_c_b_task_dist(struct job_record *job_ptr)
}
}
if
(
job_ptr
->
details
->
cpus_per_task
==
0
)
if
(
job_ptr
->
details
->
cpus_per_task
==
0
)
job_ptr
->
details
->
cpus_per_task
=
1
;
job_ptr
->
details
->
cpus_per_task
=
1
;
if
(
job_ptr
->
details
->
overcommit
)
log_over_subscribe
=
false
;
for
(
tid
=
0
,
i
=
job_ptr
->
details
->
cpus_per_task
;
(
tid
<
maxtasks
);
for
(
tid
=
0
,
i
=
job_ptr
->
details
->
cpus_per_task
;
(
tid
<
maxtasks
);
i
+=
job_ptr
->
details
->
cpus_per_task
)
{
/* cycle counter */
i
+=
job_ptr
->
details
->
cpus_per_task
)
{
/* cycle counter */
bool
space_remaining
=
false
;
bool
space_remaining
=
false
;
if
(
over_subscribe
)
{
if
(
over_subscribe
&&
log_over_subscribe
)
{
/* 'over_subscribe' is a relief valve that guards
/* 'over_subscribe' is a relief valve that guards
* against an infinite loop, and it *should* never
* against an infinite loop, and it *should* never
* come into play because maxtasks should never be
* come into play because maxtasks should never be
* greater than the total number of available cpus
* greater than the total number of available cpus
*/
*/
error
(
"cons_res: _compute_c_b_task_dist oversubscribe"
);
error
(
"cons_res: _compute_c_b_task_dist "
"oversubscribe for job %u"
,
job_ptr
->
job_id
);
log_over_subscribe
=
false
/* Log once per job */
;
}
}
for
(
n
=
0
;
((
n
<
job_res
->
nhosts
)
&&
(
tid
<
maxtasks
));
n
++
)
{
for
(
n
=
0
;
((
n
<
job_res
->
nhosts
)
&&
(
tid
<
maxtasks
));
n
++
)
{
if
((
i
<=
avail_cpus
[
n
])
||
over_subscribe
)
{
if
((
i
<=
avail_cpus
[
n
])
||
over_subscribe
)
{
...
@@ -200,6 +208,8 @@ static int _compute_plane_dist(struct job_record *job_ptr)
...
@@ -200,6 +208,8 @@ static int _compute_plane_dist(struct job_record *job_ptr)
uint32_t
n
,
i
,
p
,
tid
,
maxtasks
,
l
;
uint32_t
n
,
i
,
p
,
tid
,
maxtasks
,
l
;
uint16_t
*
avail_cpus
,
plane_size
=
1
;
uint16_t
*
avail_cpus
,
plane_size
=
1
;
job_resources_t
*
job_res
=
job_ptr
->
job_resrcs
;
job_resources_t
*
job_res
=
job_ptr
->
job_resrcs
;
bool
log_over_subscribe
=
true
;
if
(
!
job_res
||
!
job_res
->
cpus
)
{
if
(
!
job_res
||
!
job_res
->
cpus
)
{
error
(
"cons_res: _compute_plane_dist given NULL job_res"
);
error
(
"cons_res: _compute_plane_dist given NULL job_res"
);
return
SLURM_ERROR
;
return
SLURM_ERROR
;
...
@@ -220,16 +230,19 @@ static int _compute_plane_dist(struct job_record *job_ptr)
...
@@ -220,16 +230,19 @@ static int _compute_plane_dist(struct job_record *job_ptr)
return
SLURM_ERROR
;
return
SLURM_ERROR
;
}
}
job_res
->
cpus
=
xmalloc
(
job_res
->
nhosts
*
sizeof
(
uint16_t
));
job_res
->
cpus
=
xmalloc
(
job_res
->
nhosts
*
sizeof
(
uint16_t
));
if
(
job_ptr
->
details
->
overcommit
)
log_over_subscribe
=
false
;
for
(
tid
=
0
,
i
=
0
;
(
tid
<
maxtasks
);
i
++
)
{
/* cycle counter */
for
(
tid
=
0
,
i
=
0
;
(
tid
<
maxtasks
);
i
++
)
{
/* cycle counter */
bool
space_remaining
=
false
;
bool
space_remaining
=
false
;
if
(
over_subscribe
)
{
if
(
over_subscribe
&&
log_over_subscribe
)
{
/* 'over_subscribe' is a relief valve that guards
/* 'over_subscribe' is a relief valve that guards
* against an infinite loop, and it *should* never
* against an infinite loop, and it *should* never
* come into play because maxtasks should never be
* come into play because maxtasks should never be
* greater than the total number of available cpus
* greater than the total number of available cpus
*/
*/
error
(
"cons_res: _compute_plane_dist oversubscribe"
);
error
(
"cons_res: _compute_plane_dist oversubscribe "
"for job %u"
,
job_ptr
->
job_id
);
log_over_subscribe
=
false
/* Log once per job */
;
}
}
for
(
n
=
0
;
((
n
<
job_res
->
nhosts
)
&&
(
tid
<
maxtasks
));
n
++
)
{
for
(
n
=
0
;
((
n
<
job_res
->
nhosts
)
&&
(
tid
<
maxtasks
));
n
++
)
{
for
(
p
=
0
;
p
<
plane_size
&&
(
tid
<
maxtasks
);
p
++
)
{
for
(
p
=
0
;
p
<
plane_size
&&
(
tid
<
maxtasks
);
p
++
)
{
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment