Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Slurm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tud-zih-energy
Slurm
Commits
9b5facf4
Commit
9b5facf4
authored
14 years ago
by
Moe Jette
Browse files
Options
Downloads
Patches
Plain Diff
streamline gres job state save/restore logic
parent
9802ef0e
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/common/gres.c
+60
-204
60 additions, 204 deletions
src/common/gres.c
with
60 additions
and
204 deletions
src/common/gres.c
+
60
−
204
View file @
9b5facf4
...
...
@@ -100,7 +100,6 @@ typedef struct slurm_gres_context {
slurm_gres_ops_t
ops
;
/* pointers to plugin symbols */
uint32_t
plugin_id
;
/* key for searches */
plugrack_t
plugin_list
;
/* plugrack info */
bool
unpacked_info
;
/* info unpacked */
}
slurm_gres_context_t
;
/* Generic gres data structure for adding to a list. Depending upon the
...
...
@@ -135,9 +134,6 @@ static int _job_config_validate(char *config, uint32_t *gres_cnt,
slurm_gres_context_t
*
context_ptr
);
static
void
_job_state_delete
(
void
*
gres_data
);
static
void
*
_job_state_dup
(
void
*
gres_data
);
static
int
_job_state_pack
(
void
*
gres_data
,
Buf
buffer
);
static
int
_job_state_unpack
(
void
**
gres_data
,
Buf
buffer
,
char
*
gres_name
);
static
int
_job_state_validate
(
char
*
config
,
void
**
gres_data
,
slurm_gres_context_t
*
gres_name
);
extern
uint32_t
_job_test
(
void
*
job_gres_data
,
void
*
node_gres_data
,
...
...
@@ -837,8 +833,6 @@ extern int gres_plugin_node_config_unpack(Buf buffer, char* node_name)
return
SLURM_SUCCESS
;
slurm_mutex_lock
(
&
gres_context_lock
);
for
(
j
=
0
;
j
<
gres_context_cnt
;
j
++
)
gres_context
[
j
].
unpacked_info
=
false
;
for
(
i
=
0
;
i
<
rec_cnt
;
i
++
)
{
safe_unpack32
(
&
magic
,
buffer
);
if
(
magic
!=
GRES_MAGIC
)
...
...
@@ -876,7 +870,6 @@ extern int gres_plugin_node_config_unpack(Buf buffer, char* node_name)
count
=
1024
;
}
gres_context
[
j
].
has_file
=
has_file
;
gres_context
[
j
].
unpacked_info
=
true
;
break
;
}
if
(
j
>=
gres_context_cnt
)
{
...
...
@@ -899,14 +892,6 @@ extern int gres_plugin_node_config_unpack(Buf buffer, char* node_name)
p
->
plugin_id
=
plugin_id
;
list_append
(
gres_conf_list
,
p
);
}
for
(
j
=
0
;
j
<
gres_context_cnt
;
j
++
)
{
if
(
gres_context
[
j
].
unpacked_info
)
continue
;
/* A likely sign GresPlugins is inconsistently configured. */
error
(
"gres_plugin_node_config_unpack: no data type of type "
"%s from node %s"
,
gres_context
[
j
].
gres_type
,
node_name
);
}
slurm_mutex_unlock
(
&
gres_context_lock
);
return
rc
;
...
...
@@ -1575,9 +1560,6 @@ extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer,
fatal
(
"list_create malloc failure"
);
}
for
(
i
=
0
;
i
<
gres_context_cnt
;
i
++
)
gres_context
[
i
].
unpacked_info
=
false
;
while
((
rc
==
SLURM_SUCCESS
)
&&
(
rec_cnt
))
{
if
((
buffer
==
NULL
)
||
(
remaining_buf
(
buffer
)
==
0
))
break
;
...
...
@@ -1600,7 +1582,6 @@ extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer,
* Not a fatal error, skip over the data. */
continue
;
}
gres_context
[
i
].
unpacked_info
=
true
;
gres_node_ptr
=
_build_gres_node_state
();
gres_node_ptr
->
gres_cnt_avail
=
gres_cnt_avail
;
if
(
has_bitmap
)
{
...
...
@@ -1614,30 +1595,14 @@ extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer,
gres_ptr
->
gres_data
=
gres_node_ptr
;
list_append
(
*
gres_list
,
gres_ptr
);
}
fini:
/* Ensure that every gres plugin is called for unpack, even if no data
* was packed by the node. A likely sign that GresPlugins is
* inconsistently configured. */
for
(
i
=
0
;
i
<
gres_context_cnt
;
i
++
)
{
if
(
gres_context
[
i
].
unpacked_info
)
continue
;
error
(
"gres_plugin_node_state_unpack: no info packed for %s "
"by node %s"
,
gres_context
[
i
].
gres_type
,
node_name
);
gres_node_ptr
=
_build_gres_node_state
();
gres_ptr
=
xmalloc
(
sizeof
(
gres_state_t
));
gres_ptr
->
plugin_id
=
gres_context
[
i
].
plugin_id
;
gres_ptr
->
gres_data
=
gres_node_ptr
;
list_append
(
*
gres_list
,
gres_ptr
);
}
slurm_mutex_unlock
(
&
gres_context_lock
);
return
rc
;
unpack_error:
error
(
"gres_plugin_node_state_unpack: unpack error from node %s"
,
node_name
);
rc
=
SLURM_ERROR
;
goto
fini
;
slurm_mutex_unlock
(
&
gres_context_lock
)
;
return
SLURM_ERROR
;
}
static
void
*
_node_state_dup
(
void
*
gres_data
)
...
...
@@ -2142,7 +2107,6 @@ static void *_job_state_dup(void *gres_data)
*/
List
gres_plugin_job_state_dup
(
List
gres_list
)
{
int
i
;
ListIterator
gres_iter
;
gres_state_t
*
gres_ptr
,
*
new_gres_state
;
List
new_gres_list
=
NULL
;
...
...
@@ -2156,27 +2120,18 @@ List gres_plugin_job_state_dup(List gres_list)
slurm_mutex_lock
(
&
gres_context_lock
);
gres_iter
=
list_iterator_create
(
gres_list
);
while
((
gres_ptr
=
(
gres_state_t
*
)
list_next
(
gres_iter
)))
{
for
(
i
=
0
;
i
<
gres_context_cnt
;
i
++
)
{
if
(
gres_ptr
->
plugin_id
!=
gres_context
[
i
].
plugin_id
)
continue
;
new_gres_data
=
_job_state_dup
(
gres_ptr
->
gres_data
);
if
(
new_gres_data
==
NULL
)
break
;
if
(
new_gres_list
==
NULL
)
{
new_gres_list
=
list_create
(
_gres_job_list_delete
);
if
(
new_gres_list
==
NULL
)
fatal
(
"list_create: malloc failure"
);
}
new_gres_state
=
xmalloc
(
sizeof
(
gres_state_t
));
new_gres_state
->
plugin_id
=
gres_ptr
->
plugin_id
;
new_gres_state
->
gres_data
=
new_gres_data
;
list_append
(
new_gres_list
,
new_gres_state
);
new_gres_data
=
_job_state_dup
(
gres_ptr
->
gres_data
);
if
(
new_gres_data
==
NULL
)
break
;
if
(
new_gres_list
==
NULL
)
{
new_gres_list
=
list_create
(
_gres_job_list_delete
);
if
(
new_gres_list
==
NULL
)
fatal
(
"list_create: malloc failure"
);
}
if
(
i
>=
gres_context_cnt
)
{
error
(
"Could not find
plugin
id
%u to dup job record"
,
gres_ptr
->
plugin_id
)
;
}
new_gres_state
=
xmalloc
(
sizeof
(
gres_state_t
));
new_gres_state
->
plugin
_
id
=
gres_ptr
->
plugin_id
;
new_gres_state
->
gres_data
=
new_gres_data
;
list_append
(
new_gres_list
,
new_gres_state
);
}
list_iterator_destroy
(
gres_iter
);
slurm_mutex_unlock
(
&
gres_context_lock
);
...
...
@@ -2184,19 +2139,6 @@ List gres_plugin_job_state_dup(List gres_list)
return
new_gres_list
;
}
/*
 * Pack one job's gres allocation record into a buffer.
 *
 * The fields are written in this order: the allocated gres count, the node
 * count, and then one bit string per node taken from gres_bit_alloc[].
 *
 * IN gres_data - the job's gres record, interpreted as a gres_job_state_t
 * IN buffer - buffer that receives the packed fields
 * RET SLURM_SUCCESS (this function has no failure path of its own)
 *
 * NOTE(review): gres_bit_alloc is indexed without a NULL check, so callers
 * presumably must supply an array of node_cnt entries whenever node_cnt is
 * non-zero -- confirm against the callers.
 */
static int _job_state_pack(void *gres_data, Buf buffer)
{
	gres_job_state_t *job_state = (gres_job_state_t *) gres_data;
	int node_inx = 0;

	pack32(job_state->gres_cnt_alloc, buffer);
	pack32(job_state->node_cnt, buffer);

	/* One bit string per node, in node index order */
	while (node_inx < job_state->node_cnt) {
		pack_bit_str(job_state->gres_bit_alloc[node_inx], buffer);
		node_inx++;
	}

	return SLURM_SUCCESS;
}
/*
* Pack a job's current gres status, called from slurmctld for save/restore
* IN gres_list - generated by gres_plugin_job_config_validate()
...
...
@@ -2206,13 +2148,14 @@ static int _job_state_pack(void *gres_data, Buf buffer)
extern
int
gres_plugin_job_state_pack
(
List
gres_list
,
Buf
buffer
,
uint32_t
job_id
)
{
int
i
,
rc
=
SLURM_SUCCESS
,
rc2
;
uint32_t
top_offset
,
gres_size
=
0
;
uint32_t
header_offset
,
size_offset
,
data_offset
,
tail_offset
;
int
i
,
rc
=
SLURM_SUCCESS
;
uint32_t
top_offset
,
tail_offset
;
uint32_t
magic
=
GRES_MAGIC
;
uint16_t
rec_cnt
=
0
;
uint8_t
has_bitmap
;
ListIterator
gres_iter
;
gres_state_t
*
gres_ptr
;
gres_job_state_t
*
gres_job_ptr
;
top_offset
=
get_buf_offset
(
buffer
);
pack16
(
rec_cnt
,
buffer
);
/* placeholder if data */
...
...
@@ -2225,35 +2168,23 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
slurm_mutex_lock
(
&
gres_context_lock
);
gres_iter
=
list_iterator_create
(
gres_list
);
while
((
gres_ptr
=
(
gres_state_t
*
)
list_next
(
gres_iter
)))
{
for
(
i
=
0
;
i
<
gres_context_cnt
;
i
++
)
{
if
(
gres_ptr
->
plugin_id
!=
gres_context
[
i
].
plugin_id
)
continue
;
header_offset
=
get_buf_offset
(
buffer
);
pack32
(
magic
,
buffer
);
pack32
(
gres_ptr
->
plugin_id
,
buffer
);
size_offset
=
get_buf_offset
(
buffer
);
pack32
(
gres_size
,
buffer
);
/* placeholder */
data_offset
=
get_buf_offset
(
buffer
);
rc2
=
_job_state_pack
(
gres_ptr
->
gres_data
,
buffer
);
if
(
rc2
!=
SLURM_SUCCESS
)
{
rc
=
rc2
;
set_buf_offset
(
buffer
,
header_offset
);
continue
;
gres_job_ptr
=
(
gres_job_state_t
*
)
gres_ptr
->
gres_data
;
pack32
(
magic
,
buffer
);
pack32
(
gres_ptr
->
plugin_id
,
buffer
);
pack32
(
gres_job_ptr
->
gres_cnt_alloc
,
buffer
);
pack32
(
gres_job_ptr
->
node_cnt
,
buffer
);
if
(
gres_job_ptr
->
gres_bit_alloc
)
{
has_bitmap
=
1
;
pack8
(
has_bitmap
,
buffer
);
for
(
i
=
0
;
i
<
gres_job_ptr
->
node_cnt
;
i
++
)
{
pack_bit_str
(
gres_job_ptr
->
gres_bit_alloc
[
i
],
buffer
);
}
tail_offset
=
get_buf_offset
(
buffer
);
set_buf_offset
(
buffer
,
size_offset
);
gres_size
=
tail_offset
-
data_offset
;
pack32
(
gres_size
,
buffer
);
set_buf_offset
(
buffer
,
tail_offset
);
rec_cnt
++
;
break
;
}
if
(
i
>=
gres_context_cnt
)
{
error
(
"Could not find plugin id %u to pack record for "
"job %u"
,
gres_ptr
->
plugin_id
,
job_id
);
}
else
{
has_bitmap
=
0
;
pack8
(
has_bitmap
,
buffer
);
}
rec_cnt
++
;
}
list_iterator_destroy
(
gres_iter
);
slurm_mutex_unlock
(
&
gres_context_lock
);
...
...
@@ -2266,37 +2197,6 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
return
rc
;
}
/*
 * Rebuild one job's gres allocation record from a buffer.
 *
 * A new gres_job_state_t is always allocated. When buffer is non-NULL the
 * allocated gres count, the node count and then one bit string per node are
 * read from it, in that order. When buffer is NULL nothing is read and the
 * freshly allocated record is returned untouched.
 *
 * OUT gres_data - set to the new record on success, NULL on failure
 * IN buffer - buffer to read from, or NULL to get a record with no data
 * IN gres_name - gres name, used only in the error message
 * RET SLURM_SUCCESS or SLURM_ERROR
 *
 * NOTE(review): nothing in this function jumps to unpack_error explicitly;
 * the safe_unpack32() / unpack_bit_str() macros presumably do so when the
 * buffer is short or corrupt -- confirm against pack.h.
 * NOTE(review): the cleanup path tests gres_bit_alloc and frees every
 * element, which looks like it relies on xmalloc() returning zeroed
 * memory -- confirm.
 */
static int _job_state_unpack(void **gres_data, Buf buffer, char *gres_name)
{
	gres_job_state_t *job_state = xmalloc(sizeof(gres_job_state_t));
	int node_inx;

	if (buffer) {
		safe_unpack32(&job_state->gres_cnt_alloc, buffer);
		safe_unpack32(&job_state->node_cnt, buffer);
		job_state->gres_bit_alloc = xmalloc(sizeof(bitstr_t *) *
						    job_state->node_cnt);
		node_inx = 0;
		while (node_inx < job_state->node_cnt) {
			unpack_bit_str(&job_state->gres_bit_alloc[node_inx],
				       buffer);
			node_inx++;
		}
	}

	*gres_data = job_state;
	return SLURM_SUCCESS;

unpack_error:
	error("Unpacking gres/%s job state info", gres_name);
	/* Release whatever part of the record was built before the failure */
	if (job_state->gres_bit_alloc) {
		for (node_inx = 0; node_inx < job_state->node_cnt; node_inx++) {
			FREE_NULL_BITMAP(job_state->gres_bit_alloc[node_inx]);
		}
		xfree(job_state->gres_bit_alloc);
	}
	xfree(job_state);
	*gres_data = NULL;
	return SLURM_ERROR;
}
/*
* Unpack a job's current gres status, called from slurmctld for save/restore
* OUT gres_list - restored state stored by gres_plugin_job_state_pack()
...
...
@@ -2306,11 +2206,12 @@ unpack_error:
extern
int
gres_plugin_job_state_unpack
(
List
*
gres_list
,
Buf
buffer
,
uint32_t
job_id
)
{
int
i
,
rc
,
rc2
;
uint32_t
gres_size
,
magic
,
tail_offset
,
plugin_id
;
int
i
,
rc
;
uint32_t
magic
,
plugin_id
;
uint16_t
rec_cnt
;
uint8_t
has_bitmap
;
gres_state_t
*
gres_ptr
;
void
*
gres_data
;
gres_job_state_t
*
gres_job_ptr
=
NULL
;
safe_unpack16
(
&
rec_cnt
,
buffer
);
if
(
rec_cnt
==
0
)
...
...
@@ -2325,9 +2226,6 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer,
fatal
(
"list_create malloc failure"
);
}
for
(
i
=
0
;
i
<
gres_context_cnt
;
i
++
)
gres_context
[
i
].
unpacked_info
=
false
;
while
((
rc
==
SLURM_SUCCESS
)
&&
(
rec_cnt
))
{
if
((
buffer
==
NULL
)
||
(
remaining_buf
(
buffer
)
==
0
))
break
;
...
...
@@ -2336,7 +2234,19 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer,
if
(
magic
!=
GRES_MAGIC
)
goto
unpack_error
;
safe_unpack32
(
&
plugin_id
,
buffer
);
safe_unpack32
(
&
gres_size
,
buffer
);
gres_job_ptr
=
xmalloc
(
sizeof
(
gres_job_state_t
));
safe_unpack32
(
&
gres_job_ptr
->
gres_cnt_alloc
,
buffer
);
safe_unpack32
(
&
gres_job_ptr
->
node_cnt
,
buffer
);
safe_unpack8
(
&
has_bitmap
,
buffer
);
if
(
has_bitmap
)
{
gres_job_ptr
->
gres_bit_alloc
=
xmalloc
(
sizeof
(
bitstr_t
*
)
*
gres_job_ptr
->
node_cnt
);
for
(
i
=
0
;
i
<
gres_job_ptr
->
node_cnt
;
i
++
)
{
unpack_bit_str
(
&
gres_job_ptr
->
gres_bit_alloc
[
i
],
buffer
);
}
}
for
(
i
=
0
;
i
<
gres_context_cnt
;
i
++
)
{
if
(
gres_context
[
i
].
plugin_id
==
plugin_id
)
break
;
...
...
@@ -2347,53 +2257,25 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer,
plugin_id
,
job_id
);
/* A likely sign that GresPlugins has changed.
* Not a fatal error, skip over the data. */
tail_offset
=
get_buf_offset
(
buffer
);
tail_offset
+=
gres_size
;
set_buf_offset
(
buffer
,
tail_offset
);
_job_state_delete
(
gres_job_ptr
);
continue
;
}
gres_context
[
i
].
unpacked_info
=
true
;
rc2
=
_job_state_unpack
(
&
gres_data
,
buffer
,
gres_context
[
i
].
gres_name
);
if
(
rc2
!=
SLURM_SUCCESS
)
{
rc
=
rc2
;
}
else
{
gres_ptr
=
xmalloc
(
sizeof
(
gres_state_t
));
gres_ptr
->
plugin_id
=
gres_context
[
i
].
plugin_id
;
gres_ptr
->
gres_data
=
gres_data
;
list_append
(
*
gres_list
,
gres_ptr
);
}
}
fini:
/* Ensure that every gres plugin is called for unpack, even if no data
* was packed by the job. A likely sign that GresPlugins is
* inconsistently configured. */
for
(
i
=
0
;
i
<
gres_context_cnt
;
i
++
)
{
if
(
gres_context
[
i
].
unpacked_info
)
continue
;
debug
(
"gres_plugin_job_state_unpack: no info packed for %s "
"by job %u"
,
gres_context
[
i
].
gres_type
,
job_id
);
rc2
=
_job_state_unpack
(
&
gres_data
,
NULL
,
gres_context
[
i
].
gres_name
);
if
(
rc2
!=
SLURM_SUCCESS
)
{
rc
=
rc2
;
}
else
{
gres_ptr
=
xmalloc
(
sizeof
(
gres_state_t
));
gres_ptr
->
plugin_id
=
gres_context
[
i
].
plugin_id
;
gres_ptr
->
gres_data
=
gres_data
;
list_append
(
*
gres_list
,
gres_ptr
);
}
gres_ptr
=
xmalloc
(
sizeof
(
gres_state_t
));
gres_ptr
->
plugin_id
=
gres_context
[
i
].
plugin_id
;
gres_ptr
->
gres_data
=
gres_job_ptr
;
gres_job_ptr
=
NULL
;
/* nothing left to free on error */
list_append
(
*
gres_list
,
gres_ptr
);
}
slurm_mutex_unlock
(
&
gres_context_lock
);
return
rc
;
unpack_error:
error
(
"gres_plugin_job_state_unpack: unpack error from job %u"
,
job_id
);
rc
=
SLURM_ERROR
;
goto
fini
;
if
(
gres_job_ptr
)
_job_state_delete
(
gres_job_ptr
);
slurm_mutex_unlock
(
&
gres_context_lock
);
return
SLURM_ERROR
;
}
/* If CPU bitmap from slurmd differs in size from that in slurmctld,
...
...
@@ -3211,7 +3093,6 @@ static void *_step_state_dup(void *gres_data)
*/
List
gres_plugin_step_state_dup
(
List
gres_list
)
{
int
i
;
ListIterator
gres_iter
;
gres_state_t
*
gres_ptr
,
*
new_gres_state
;
List
new_gres_list
=
NULL
;
...
...
@@ -3393,9 +3274,6 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
fatal
(
"list_create malloc failure"
);
}
for
(
i
=
0
;
i
<
gres_context_cnt
;
i
++
)
gres_context
[
i
].
unpacked_info
=
false
;
while
((
rc
==
SLURM_SUCCESS
)
&&
(
rec_cnt
))
{
if
((
buffer
==
NULL
)
||
(
remaining_buf
(
buffer
)
==
0
))
break
;
...
...
@@ -3421,7 +3299,6 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
set_buf_offset
(
buffer
,
tail_offset
);
continue
;
}
gres_context
[
i
].
unpacked_info
=
true
;
rc2
=
_step_state_unpack
(
&
gres_data
,
buffer
,
gres_context
[
i
].
gres_name
);
if
(
rc2
!=
SLURM_SUCCESS
)
{
...
...
@@ -3433,27 +3310,6 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
list_append
(
*
gres_list
,
gres_ptr
);
}
}
fini:
/* Ensure that every gres plugin is called for unpack, even if no data
* was packed by the job. A likely sign that GresPlugins is
* inconsistently configured. */
for
(
i
=
0
;
i
<
gres_context_cnt
;
i
++
)
{
if
(
gres_context
[
i
].
unpacked_info
)
continue
;
debug
(
"gres_plugin_job_state_unpack: no info packed for %s "
"by step %u.%u"
,
gres_context
[
i
].
gres_type
,
job_id
,
step_id
);
rc2
=
_step_state_unpack
(
&
gres_data
,
NULL
,
gres_context
[
i
].
gres_name
);
if
(
rc2
!=
SLURM_SUCCESS
)
{
rc
=
rc2
;
}
else
{
gres_ptr
=
xmalloc
(
sizeof
(
gres_state_t
));
gres_ptr
->
plugin_id
=
gres_context
[
i
].
plugin_id
;
gres_ptr
->
gres_data
=
gres_data
;
list_append
(
*
gres_list
,
gres_ptr
);
}
}
slurm_mutex_unlock
(
&
gres_context_lock
);
return
rc
;
...
...
@@ -3461,8 +3317,8 @@ fini: /* Insure that every gres plugin is called for unpack, even if no data
unpack_error:
error
(
"gres_plugin_job_state_unpack: unpack error from step %u.%u"
,
job_id
,
step_id
);
rc
=
SLURM_ERROR
;
goto
fini
;
slurm_mutex_unlock
(
&
gres_context_lock
)
;
return
SLURM_ERROR
;
}
static
void
_step_state_log
(
void
*
gres_data
,
uint32_t
job_id
,
uint32_t
step_id
,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment