tud-zih-energy / Slurm / Commits

Commit dc5925f1 authored 15 years ago by Moe Jette
In select/cons_res, allocated cores for a job using a best-fit approach.
parent ca89b39f
Showing 2 changed files with 139 additions and 25 deletions:

  NEWS                                        +1   −0   (1 addition, 0 deletions)
  src/plugins/select/cons_res/dist_tasks.c  +138  −25   (138 additions, 25 deletions)
NEWS  (+1, −0)

@@ -8,6 +8,7 @@ documents those changes that are of interest to users and admins.
 -- In select/cons_res, the count of CPUs on required nodes was formerly
    ignored in enforcing the maximum CPU limit. Also enforce maximum CPU
    limit when the topology/tree plugin is configured (previously ignored).
+-- In select/cons_res, allocated cores for a job using a best-fit approach.
 * Changes in SLURM 2.2.0.pre2
 =============================
src/plugins/select/cons_res/dist_tasks.c  (+138, −25)
@@ -165,7 +165,8 @@ static int _compute_plane_dist(struct job_record *job_ptr)
         return SLURM_SUCCESS;
 }
 
-/* sync up core bitmap with new CPU count
+/* sync up core bitmap with new CPU count using a best-fit approach
+ * on the available sockets
  *
  * The CPU array contains the distribution of CPUs, which can include
  * virtual CPUs (hyperthreads)
@@ -173,11 +174,19 @@ static int _compute_plane_dist(struct job_record *job_ptr)
 static void _block_sync_core_bitmap(struct job_record *job_ptr,
                                     const uint16_t cr_type)
 {
-        uint32_t c, i, n, size, csize, core_cnt;
+        uint32_t c, s, i, j, n, size, csize, core_cnt;
         uint16_t cpus, num_bits, vpus = 1;
         job_resources_t *job_res = job_ptr->job_resrcs;
         bool alloc_cores = false, alloc_sockets = false;
         uint16_t ntasks_per_core = 0xffff;
+        int *sockets_cpu_cnt;
+        bool *sockets_used;
+        uint16_t sockets_nb;
+        uint16_t ncores_nb;
+        uint16_t nsockets_nb;
+        uint16_t req_cpus, best_fit_cpus = 0;
+        uint32_t best_fit_location = 0;
+        bool sufficient, best_fit_sufficient;
 
         if (!job_res)
                 return;
@@ -198,53 +207,157 @@ static void _block_sync_core_bitmap(struct job_record *job_ptr,
         size  = bit_size(job_res->node_bitmap);
         csize = bit_size(job_res->core_bitmap);
+
+        sockets_nb  = select_node_record[0].sockets;
+        sockets_cpu_cnt = xmalloc(sockets_nb * sizeof(int));
+        sockets_used = xmalloc(sockets_nb * sizeof(bool));
+
         for (c = 0, i = 0, n = 0; n < size; n++) {
+
                 if (bit_test(job_res->node_bitmap, n) == 0)
                         continue;
+
                 core_cnt = 0;
-                num_bits = select_node_record[n].sockets *
-                           select_node_record[n].cores;
+                ncores_nb = select_node_record[n].cores;
+                nsockets_nb = select_node_record[n].sockets;
+                num_bits = nsockets_nb * ncores_nb;
+
                 if ((c + num_bits) > csize)
                         fatal("cons_res: _block_sync_core_bitmap index error");
+
                 cpus  = job_res->cpus[i];
                 vpus = MIN(select_node_record[n].vpus, ntasks_per_core);
-                while ((cpus > 0) && (num_bits > 0)) {
-                        if (bit_test(job_res->core_bitmap, c++)) {
-                                core_cnt++;
-                                if (cpus < vpus)
-                                        cpus = 0;
-                                else
-                                        cpus -= vpus;
-                        }
-                        num_bits--;
-                }
+
+                if (nsockets_nb > sockets_nb) {
+                        sockets_nb = nsockets_nb;
+                        xrealloc(sockets_cpu_cnt, sockets_nb * sizeof(int));
+                        xrealloc(sockets_used, sockets_nb * sizeof(bool));
+                }
+
+                /* count cores provided by each socket */
+                for (s = 0; s < nsockets_nb; s++) {
+                        sockets_cpu_cnt[s] = 0;
+                        sockets_used[s] = false;
+                        for (j = c + (s * ncores_nb);
+                             j < c + ((s + 1) * ncores_nb); j++) {
+                                if (bit_test(job_res->core_bitmap, j))
+                                        sockets_cpu_cnt[s]++;
+                        }
+                }
+
+                /* select cores in the sockets using a best-fit approach */
+                while (cpus > 0) {
+
+                        best_fit_cpus = 0;
+                        best_fit_sufficient = false;
+
+                        /* compute still required cores on the node */
+                        req_cpus = cpus / vpus;
+                        if (cpus % vpus)
+                                req_cpus++;
+
+                        /* search for the best socket, */
+                        /* starting from the last one to let more room */
+                        /* in the first one for system usage */
+                        for (s = nsockets_nb - 1; (int) s >= (int) 0; s--) {
+                                sufficient = sockets_cpu_cnt[s] >= req_cpus;
+                                if ((best_fit_cpus == 0) ||
+                                    (sufficient && !best_fit_sufficient) ||
+                                    (sufficient &&
+                                     (sockets_cpu_cnt[s] < best_fit_cpus)) ||
+                                    (!sufficient &&
+                                     (sockets_cpu_cnt[s] > best_fit_cpus))) {
+                                        best_fit_cpus = sockets_cpu_cnt[s];
+                                        best_fit_location = s;
+                                        best_fit_sufficient = sufficient;
+                                }
+                        }
+
+                        /* check that we have found a usable socket */
+                        if (best_fit_cpus == 0)
+                                break;
+
+                        debug3("dist_task: best_fit : using node[%lu]:"
+                               "socket[%lu] : %u cores available",
+                               n, best_fit_location,
+                               sockets_cpu_cnt[best_fit_location]);
+
+                        /* select socket cores from last to first */
+                        /* socket[0]:Core[0] would be the last one */
+                        sockets_used[best_fit_location] = true;
+                        for (j = c + ((best_fit_location + 1) * ncores_nb) - 1;
+                             (int) j >= (int) (c + (best_fit_location *
+                                                    ncores_nb)); j--) {
+                                /*
+                                 * if no more cpus to select
+                                 * release remaining cores unless
+                                 * we are allocating whole sockets
+                                 */
+                                if (cpus == 0 && alloc_sockets) {
+                                        if (bit_test(job_res->core_bitmap, j))
+                                                core_cnt++;
+                                        continue;
+                                }
+                                else if (cpus == 0) {
+                                        bit_clear(job_res->core_bitmap, j);
+                                        continue;
+                                }
+
+                                /*
+                                 * remove cores from socket count and
+                                 * cpus count using hyperthreading requirement
+                                 */
+                                if (bit_test(job_res->core_bitmap, j)) {
+                                        sockets_cpu_cnt[best_fit_location]--;
+                                        core_cnt++;
+                                        if (cpus < vpus)
+                                                cpus = 0;
+                                        else
+                                                cpus -= vpus;
+                                }
+                        }
+
+                        /* loop again if more cpus required */
+                        if (cpus > 0)
+                                continue;
+
+                        /* release remaining cores of the unused sockets */
+                        for (s = 0; s < nsockets_nb; s++) {
+                                if (sockets_used[s])
+                                        continue;
+                                bit_nclear(job_res->core_bitmap,
+                                           c + (s * ncores_nb),
+                                           c + ((s + 1) * ncores_nb) - 1);
+                        }
+                }
+
                 if (cpus > 0)
                         /* cpu count should NEVER be greater than the number
                          * of set bits in the core bitmap for a given node */
                         fatal("cons_res: cpus computation error");
-                if (alloc_sockets) {    /* Advance to end of socket */
-                        while ((num_bits > 0) &&
-                               (c % select_node_record[n].cores)) {
-                                if (bit_test(job_res->core_bitmap, c++))
-                                        core_cnt++;
-                                num_bits--;
-                        }
-                }
-                while (num_bits > 0) {
-                        bit_clear(job_res->core_bitmap, c++);
-                        num_bits--;
-                }
+
+                /* adjust cpus count of the current node */
                 if ((alloc_cores || alloc_sockets) &&
                     (select_node_record[n].vpus > 1)) {
                         job_res->cpus[i] = core_cnt *
                                            select_node_record[n].vpus;
                 }
                 i++;
+
+                /* move c to the next node in core_bitmap */
+                c += num_bits;
         }
+
+        xfree(sockets_cpu_cnt);
+        xfree(sockets_used);
 }
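For readers skimming the patch, the heart of the change is the socket-selection rule inside the new while (cpus > 0) loop: among the sockets of a node, prefer the smallest socket that still offers enough free cores, and fall back to the socket with the most free cores when none is large enough. The standalone sketch below restates that rule outside of SLURM's data structures; the function name pick_best_fit_socket, the socket_free_cores array, and the example node layout are invented for illustration and are not part of the SLURM code base.

/*
 * Hypothetical, self-contained illustration of the best-fit socket choice
 * used in the patch above. Names and the example layout are invented;
 * this is not SLURM API code.
 */
#include <stdbool.h>
#include <stdio.h>

/* Return the index of the chosen socket, or -1 if no socket has free cores. */
static int pick_best_fit_socket(const int *socket_free_cores, int nsockets,
                                int req_cores)
{
        int best_cores = 0;
        int best_location = -1;
        bool best_sufficient = false;
        bool sufficient;
        int s;

        /* scan from the last socket down, as the patch does, to leave
         * more room on socket 0 for system usage */
        for (s = nsockets - 1; s >= 0; s--) {
                sufficient = socket_free_cores[s] >= req_cores;
                if ((best_cores == 0) ||
                    (sufficient && !best_sufficient) ||
                    (sufficient && (socket_free_cores[s] < best_cores)) ||
                    (!sufficient && (socket_free_cores[s] > best_cores))) {
                        best_cores = socket_free_cores[s];
                        best_location = s;
                        best_sufficient = sufficient;
                }
        }
        return (best_cores == 0) ? -1 : best_location;
}

int main(void)
{
        /* free cores per socket on an imaginary 4-socket node */
        int free_cores[4] = { 6, 2, 4, 3 };

        /* a request for 3 cores lands on socket 3 (3 free) rather than
         * socket 0 (6 free): the tightest sufficient socket wins */
        printf("chosen socket: %d\n",
               pick_best_fit_socket(free_cores, 4, 3));
        return 0;
}

Scanning from the last socket down mirrors the patch's intent of keeping socket 0 as free as possible for system usage; a tie between equally suitable sockets therefore resolves to the higher-numbered one.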