Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Slurm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tud-zih-energy
Slurm
Commits
bdd58a4c
Commit
bdd58a4c
authored
14 years ago
by
Moe Jette
Browse files
Options
Downloads
Patches
Plain Diff
this reverts some earlier changes that appear to cause problems, still not sure why...
parent
fa42fc5d
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/slurmctld/gang.c
+62
-32
62 additions, 32 deletions
src/slurmctld/gang.c
with
62 additions
and
32 deletions
src/slurmctld/gang.c
+
62
−
32
View file @
bdd58a4c
...
@@ -146,9 +146,11 @@ static uint32_t gs_debug_flags = 0;
...
@@ -146,9 +146,11 @@ static uint32_t gs_debug_flags = 0;
static
uint16_t
gs_fast_schedule
=
0
;
static
uint16_t
gs_fast_schedule
=
0
;
static
struct
gs_part
*
gs_part_list
=
NULL
;
static
struct
gs_part
*
gs_part_list
=
NULL
;
static
uint32_t
default_job_list_size
=
64
;
static
uint32_t
default_job_list_size
=
64
;
static
uint32_t
gs_resmap_size
=
0
;
static
pthread_mutex_t
data_mutex
=
PTHREAD_MUTEX_INITIALIZER
;
static
pthread_mutex_t
data_mutex
=
PTHREAD_MUTEX_INITIALIZER
;
static
uint16_t
*
gs_cpus_per_node
=
NULL
;
static
uint16_t
*
gs_bits_per_node
=
NULL
;
static
uint32_t
*
gs_bit_rep_count
=
NULL
;
static
struct
gs_part
**
gs_part_sorted
=
NULL
;
static
struct
gs_part
**
gs_part_sorted
=
NULL
;
static
uint32_t
num_sorted_part
=
0
;
static
uint32_t
num_sorted_part
=
0
;
...
@@ -187,7 +189,7 @@ static void _print_jobs(struct gs_part *p_ptr)
...
@@ -187,7 +189,7 @@ static void _print_jobs(struct gs_part *p_ptr)
if
(
gs_debug_flags
&
DEBUG_FLAG_GANG
)
{
if
(
gs_debug_flags
&
DEBUG_FLAG_GANG
)
{
info
(
"gang: part %s has %u jobs, %u shadows:"
,
info
(
"gang: part %s has %u jobs, %u shadows:"
,
p_ptr
->
part_name
,
p_ptr
->
num_jobs
,
p_ptr
->
num_shadows
);
p_ptr
->
part_name
,
p_ptr
->
num_jobs
,
p_ptr
->
num_shadows
);
for
(
i
=
0
;
i
<
p_ptr
->
num_shadows
;
i
++
)
{
for
(
i
=
0
;
i
<
p_ptr
->
num_shadows
;
i
++
)
{
info
(
"gang: shadow job %u row_s %s, sig_s %s"
,
info
(
"gang: shadow job %u row_s %s, sig_s %s"
,
p_ptr
->
shadow
[
i
]
->
job_ptr
->
job_id
,
p_ptr
->
shadow
[
i
]
->
job_ptr
->
job_id
,
...
@@ -204,7 +206,7 @@ static void _print_jobs(struct gs_part *p_ptr)
...
@@ -204,7 +206,7 @@ static void _print_jobs(struct gs_part *p_ptr)
int
s
=
bit_size
(
p_ptr
->
active_resmap
);
int
s
=
bit_size
(
p_ptr
->
active_resmap
);
i
=
bit_set_count
(
p_ptr
->
active_resmap
);
i
=
bit_set_count
(
p_ptr
->
active_resmap
);
info
(
"gang: active resmap has %d of %d bits set"
,
info
(
"gang: active resmap has %d of %d bits set"
,
i
,
s
);
i
,
s
);
}
}
}
}
}
}
...
@@ -223,23 +225,28 @@ static uint16_t _get_gr_type(void)
...
@@ -223,23 +225,28 @@ static uint16_t _get_gr_type(void)
return
GS_NODE
;
return
GS_NODE
;
}
}
/* For GS_CPU gs_cpus_per_node is the total number of CPUs per node.
/* For GS_CPU gs_bits_per_node is the total number of CPUs per node.
* For GS_CORE and GS_SOCKET gs_cpus_per_node is the total number of
* For GS_CORE and GS_SOCKET gs_bits_per_node is the total number of
* cores per node.
* cores per per node.
* This function also sets gs_resmap_size;
*/
*/
static
void
_load_phys_res_cnt
(
void
)
static
void
_load_phys_res_cnt
(
void
)
{
{
uint16_t
bit
=
0
,
sock
=
0
;
uint16_t
bit
=
0
,
sock
=
0
;
uint32_t
i
;
uint32_t
i
,
bit_index
=
0
;
struct
node_record
*
node_ptr
;
struct
node_record
*
node_ptr
;
xfree
(
gs_cpus_per_node
);
xfree
(
gs_bits_per_node
);
xfree
(
gs_bit_rep_count
);
if
((
gr_type
!=
GS_CPU
)
&&
(
gr_type
!=
GS_CORE
)
&&
if
((
gr_type
!=
GS_CPU
)
&&
(
gr_type
!=
GS_CORE
)
&&
(
gr_type
!=
GS_SOCKET
))
(
gr_type
!=
GS_SOCKET
))
return
;
return
;
gs_cpus_per_node
=
xmalloc
(
node_record_count
*
sizeof
(
uint16_t
));
gs_bits_per_node
=
xmalloc
(
node_record_count
*
sizeof
(
uint16_t
));
gs_bit_rep_count
=
xmalloc
(
node_record_count
*
sizeof
(
uint32_t
));
gs_resmap_size
=
0
;
for
(
i
=
0
,
node_ptr
=
node_record_table_ptr
;
i
<
node_record_count
;
for
(
i
=
0
,
node_ptr
=
node_record_table_ptr
;
i
<
node_record_count
;
i
++
,
node_ptr
++
)
{
i
++
,
node_ptr
++
)
{
if
(
gr_type
==
GS_CPU
)
{
if
(
gr_type
==
GS_CPU
)
{
...
@@ -256,7 +263,25 @@ static void _load_phys_res_cnt(void)
...
@@ -256,7 +263,25 @@ static void _load_phys_res_cnt(void)
bit
=
node_ptr
->
cores
*
sock
;
bit
=
node_ptr
->
cores
*
sock
;
}
}
}
}
gs_cpus_per_node
[
i
]
=
bit
;
gs_resmap_size
+=
bit
;
if
(
gs_bits_per_node
[
bit_index
]
!=
bit
)
{
if
(
gs_bit_rep_count
[
bit_index
]
>
0
)
bit_index
++
;
gs_bits_per_node
[
bit_index
]
=
bit
;
}
gs_bit_rep_count
[
bit_index
]
++
;
}
/* arrays must have trailing 0 */
bit_index
+=
2
;
xrealloc
(
gs_bits_per_node
,
bit_index
*
sizeof
(
uint16_t
));
xrealloc
(
gs_bit_rep_count
,
bit_index
*
sizeof
(
uint32_t
));
bit_index
--
;
for
(
i
=
0
;
i
<
bit_index
;
i
++
)
{
debug3
(
"gang: _load_phys_res_cnt: grp %d bits %u reps %u"
,
i
,
gs_bits_per_node
[
i
],
gs_bit_rep_count
[
i
]);
}
}
}
}
...
@@ -302,8 +327,9 @@ static void _destroy_parts(void)
...
@@ -302,8 +327,9 @@ static void _destroy_parts(void)
ptr
=
ptr
->
next
;
ptr
=
ptr
->
next
;
xfree
(
tmp
->
part_name
);
xfree
(
tmp
->
part_name
);
for
(
i
=
0
;
i
<
tmp
->
num_jobs
;
i
++
)
for
(
i
=
0
;
i
<
tmp
->
num_jobs
;
i
++
)
{
xfree
(
tmp
->
job_list
[
i
]);
xfree
(
tmp
->
job_list
[
i
]);
}
xfree
(
tmp
->
shadow
);
xfree
(
tmp
->
shadow
);
FREE_NULL_BITMAP
(
tmp
->
active_resmap
);
FREE_NULL_BITMAP
(
tmp
->
active_resmap
);
xfree
(
tmp
->
active_cpus
);
xfree
(
tmp
->
active_cpus
);
...
@@ -411,7 +437,8 @@ static int _job_fits_in_active_row(struct job_record *job_ptr,
...
@@ -411,7 +437,8 @@ static int _job_fits_in_active_row(struct job_record *job_ptr,
if
((
gr_type
==
GS_CORE
)
||
(
gr_type
==
GS_SOCKET
))
{
if
((
gr_type
==
GS_CORE
)
||
(
gr_type
==
GS_SOCKET
))
{
return
job_fits_into_cores
(
job_res
,
p_ptr
->
active_resmap
,
return
job_fits_into_cores
(
job_res
,
p_ptr
->
active_resmap
,
gs_cpus_per_node
,
NULL
);
gs_bits_per_node
,
gs_bit_rep_count
);
}
}
/* gr_type == GS_NODE || gr_type == GS_CPU */
/* gr_type == GS_NODE || gr_type == GS_CPU */
...
@@ -490,14 +517,14 @@ static void _add_job_to_active(struct job_record *job_ptr,
...
@@ -490,14 +517,14 @@ static void _add_job_to_active(struct job_record *job_ptr,
bit_nclear
(
p_ptr
->
active_resmap
,
0
,
size
-
1
);
bit_nclear
(
p_ptr
->
active_resmap
,
0
,
size
-
1
);
}
}
add_job_to_cores
(
job_res
,
&
(
p_ptr
->
active_resmap
),
add_job_to_cores
(
job_res
,
&
(
p_ptr
->
active_resmap
),
gs_cpu
s_per_node
,
NULL
);
gs_bit
s_per_node
,
gs_bit_rep_count
);
if
(
gr_type
==
GS_SOCKET
)
if
(
gr_type
==
GS_SOCKET
)
_fill_sockets
(
job_res
->
node_bitmap
,
p_ptr
);
_fill_sockets
(
job_res
->
node_bitmap
,
p_ptr
);
}
else
{
}
else
{
/* GS_NODE or GS_CPU */
/* GS_NODE or GS_CPU */
if
(
!
p_ptr
->
active_resmap
)
{
if
(
!
p_ptr
->
active_resmap
)
{
debug3
(
"gang: _add_job_to_active: job %u first"
,
debug3
(
"gang: _add_job_to_active: job %u first"
,
job_ptr
->
job_id
);
job_ptr
->
job_id
);
p_ptr
->
active_resmap
=
bit_copy
(
job_res
->
node_bitmap
);
p_ptr
->
active_resmap
=
bit_copy
(
job_res
->
node_bitmap
);
}
else
if
(
p_ptr
->
jobs_active
==
0
)
{
}
else
if
(
p_ptr
->
jobs_active
==
0
)
{
debug3
(
"gang: _add_job_to_active: job %u copied"
,
debug3
(
"gang: _add_job_to_active: job %u copied"
,
...
@@ -552,16 +579,16 @@ static int _suspend_job(uint32_t job_id)
...
@@ -552,16 +579,16 @@ static int _suspend_job(uint32_t job_id)
suspend_msg_t
msg
;
suspend_msg_t
msg
;
msg
.
job_id
=
job_id
;
msg
.
job_id
=
job_id
;
if
(
gs_debug_flags
&
DEBUG_FLAG_GANG
)
info
(
"gang: suspending %u"
,
job_id
);
else
debug
(
"gang: suspending %u"
,
job_id
);
msg
.
op
=
SUSPEND_JOB
;
msg
.
op
=
SUSPEND_JOB
;
rc
=
job_suspend
(
&
msg
,
0
,
-
1
,
false
,
(
uint16_t
)
NO_VAL
);
rc
=
job_suspend
(
&
msg
,
0
,
-
1
,
false
,
(
uint16_t
)
NO_VAL
);
/* job_suspend() returns ESLURM_DISABLED if job is already suspended */
/* job_suspend() returns ESLURM_DISABLED if job is already suspended */
if
((
rc
!=
SLURM_SUCCESS
)
&&
(
rc
!=
ESLURM_DISABLED
))
{
if
(
rc
==
SLURM_SUCCESS
)
{
info
(
"gang: suspending job %u: %s"
,
if
(
gs_debug_flags
&
DEBUG_FLAG_GANG
)
job_id
,
slurm_strerror
(
rc
));
info
(
"gang: suspending %u"
,
job_id
);
else
debug
(
"gang: suspending %u"
,
job_id
);
}
else
if
(
rc
!=
ESLURM_DISABLED
)
{
info
(
"gang: suspending job %u: %s"
,
job_id
,
slurm_strerror
(
rc
));
}
}
return
rc
;
return
rc
;
}
}
...
@@ -572,15 +599,15 @@ static void _resume_job(uint32_t job_id)
...
@@ -572,15 +599,15 @@ static void _resume_job(uint32_t job_id)
suspend_msg_t
msg
;
suspend_msg_t
msg
;
msg
.
job_id
=
job_id
;
msg
.
job_id
=
job_id
;
if
(
gs_debug_flags
&
DEBUG_FLAG_GANG
)
info
(
"gang: resuming %u"
,
job_id
);
else
debug
(
"gang: resuming %u"
,
job_id
);
msg
.
op
=
RESUME_JOB
;
msg
.
op
=
RESUME_JOB
;
rc
=
job_suspend
(
&
msg
,
0
,
-
1
,
false
,
(
uint16_t
)
NO_VAL
);
rc
=
job_suspend
(
&
msg
,
0
,
-
1
,
false
,
(
uint16_t
)
NO_VAL
);
if
((
rc
!=
SLURM_SUCCESS
)
&&
(
rc
!=
ESLURM_ALREADY_DONE
))
{
if
(
rc
==
SLURM_SUCCESS
)
{
error
(
"gang: resuming job %u: %s"
,
if
(
gs_debug_flags
&
DEBUG_FLAG_GANG
)
job_id
,
slurm_strerror
(
rc
));
info
(
"gang: resuming %u"
,
job_id
);
else
debug
(
"gang: resuming %u"
,
job_id
);
}
else
if
(
rc
!=
ESLURM_ALREADY_DONE
)
{
error
(
"gang: resuming job %u: %s"
,
job_id
,
slurm_strerror
(
rc
));
}
}
}
}
...
@@ -1132,7 +1159,8 @@ extern int gs_fini(void)
...
@@ -1132,7 +1159,8 @@ extern int gs_fini(void)
_destroy_parts
();
_destroy_parts
();
xfree
(
gs_part_sorted
);
xfree
(
gs_part_sorted
);
gs_part_sorted
=
NULL
;
gs_part_sorted
=
NULL
;
xfree
(
gs_cpus_per_node
);
xfree
(
gs_bits_per_node
);
xfree
(
gs_bit_rep_count
);
pthread_mutex_unlock
(
&
data_mutex
);
pthread_mutex_unlock
(
&
data_mutex
);
debug3
(
"gang: leaving gs_fini"
);
debug3
(
"gang: leaving gs_fini"
);
...
@@ -1239,8 +1267,10 @@ extern int gs_job_fini(struct job_record *job_ptr)
...
@@ -1239,8 +1267,10 @@ extern int gs_job_fini(struct job_record *job_ptr)
* - nodes can be removed from a partition, or added to a partition
* - nodes can be removed from a partition, or added to a partition
* - this affects the size of the active resmap
* - this affects the size of the active resmap
*
*
* If nodes have been added or removed, we need to resize the existing resmaps
* If nodes have been added or removed, then the node_record_count
* to prevent errors when comparing them.
* will be different from gs_resmap_size. In this case, we need
* to resize the existing resmaps to prevent errors when comparing
* them.
*
*
* Here's the plan:
* Here's the plan:
* 1. save a copy of the global structures, and then construct
* 1. save a copy of the global structures, and then construct
...
@@ -1323,7 +1353,7 @@ extern int gs_reconfig(void)
...
@@ -1323,7 +1353,7 @@ extern int gs_reconfig(void)
if
(
IS_JOB_SUSPENDED
(
job_ptr
)
&&
if
(
IS_JOB_SUSPENDED
(
job_ptr
)
&&
(
job_ptr
->
priority
!=
0
))
{
(
job_ptr
->
priority
!=
0
))
{
debug3
(
"resuming job %u apparently suspended "
debug3
(
"resuming job %u apparently suspended "
"by gang"
,
job_ptr
->
job_id
);
"
by gang"
,
job_ptr
->
job_id
);
_resume_job
(
job_ptr
->
job_id
);
_resume_job
(
job_ptr
->
job_id
);
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment