Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Slurm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tud-zih-energy
Slurm
Commits
7d6c3b77
Commit
7d6c3b77
authored
8 years ago
by
Danny Auble
Browse files
Options
Downloads
Patches
Plain Diff
Refactor the way fed_mgr state is loaded so we can actually use it
with real persistent connections.
parent
59934649
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/slurmctld/fed_mgr.c
+51
-45
51 additions, 45 deletions
src/slurmctld/fed_mgr.c
src/slurmctld/fed_mgr.h
+1
-1
1 addition, 1 deletion
src/slurmctld/fed_mgr.h
with
52 additions
and
46 deletions
src/slurmctld/fed_mgr.c
+
51
−
45
View file @
7d6c3b77
...
...
@@ -465,6 +465,7 @@ static void _join_federation(slurmdb_federation_rec_t *fed,
lock_slurmctld
(
fed_read_lock
);
_open_persist_sends
();
unlock_slurmctld
(
fed_read_lock
);
_create_ping_thread
();
}
extern
int
fed_mgr_init
(
void
*
db_conn
)
...
...
@@ -472,29 +473,42 @@ extern int fed_mgr_init(void *db_conn)
int
rc
=
SLURM_SUCCESS
;
slurmdb_federation_cond_t
fed_cond
;
List
fed_list
;
slurmdb_federation_rec_t
*
fed
=
NULL
;
if
(
running_cache
)
{
debug
(
"Database appears down, reading federations from state file."
);
fed_mgr_state_load
(
slurmctld_conf
.
state_save_location
);
return
SLURM_SUCCESS
;
}
slurm_persist_conn_recv_server_init
();
slurmdb_init_federation_cond
(
&
fed_cond
,
0
);
fed_cond
.
cluster_list
=
list_create
(
NULL
);
list_append
(
fed_cond
.
cluster_list
,
slurmctld_cluster_name
);
if
(
running_cache
)
{
debug
(
"Database appears down, reading federations from state file."
);
fed
=
fed_mgr_state_load
(
slurmctld_conf
.
state_save_location
);
if
(
!
fed
)
{
debug2
(
"No federation state"
);
return
SLURM_SUCCESS
;
}
}
else
{
slurmdb_init_federation_cond
(
&
fed_cond
,
0
);
fed_cond
.
cluster_list
=
list_create
(
NULL
);
list_append
(
fed_cond
.
cluster_list
,
slurmctld_cluster_name
);
fed_list
=
acct_storage_g_get_federations
(
db_conn
,
getuid
(),
&
fed_cond
);
FREE_NULL_LIST
(
fed_cond
.
cluster_list
);
if
(
!
fed_list
)
{
error
(
"failed to get a federation list"
);
return
SLURM_ERROR
;
}
fed_list
=
acct_storage_g_get_federations
(
db_conn
,
getuid
(),
&
fed_cond
);
FREE_NULL_LIST
(
fed_cond
.
cluster_list
);
if
(
!
fed_list
)
{
error
(
"failed to get a federation list"
);
return
SLURM_ERROR
;
if
(
list_count
(
fed_list
)
==
1
)
fed
=
list_pop
(
fed_list
);
else
if
(
list_count
(
fed_list
)
>
1
)
{
error
(
"got more federations than expected"
);
rc
=
SLURM_ERROR
;
}
FREE_NULL_LIST
(
fed_list
);
}
if
(
list_count
(
fed_list
)
==
1
)
{
if
(
fed
)
{
slurmdb_cluster_rec_t
*
cluster
=
NULL
;
slurmdb_federation_rec_t
*
fed
=
list_pop
(
fed_list
);
if
((
cluster
=
list_find_first
(
fed
->
cluster_list
,
slurmdb_find_cluster_in_list
,
...
...
@@ -504,14 +518,8 @@ extern int fed_mgr_init(void *db_conn)
error
(
"failed to get cluster from federation that we request"
);
rc
=
SLURM_ERROR
;
}
}
else
if
(
list_count
(
fed_list
)
>
1
)
{
error
(
"got more federations than expected"
);
rc
=
SLURM_ERROR
;
}
FREE_NULL_LIST
(
fed_list
);
_create_ping_thread
();
return
rc
;
}
...
...
@@ -657,7 +665,7 @@ extern int fed_mgr_state_save(char *state_save_location)
return
error_code
;
}
extern
int
fed_mgr_state_load
(
char
*
state_save_location
)
extern
slurmdb_federation_rec_t
*
fed_mgr_state_load
(
char
*
state_save_location
)
{
Buf
buffer
=
NULL
;
char
*
data
=
NULL
,
*
state_file
;
...
...
@@ -666,8 +674,7 @@ extern int fed_mgr_state_load(char *state_save_location)
uint32_t
data_size
=
0
;
int
state_fd
;
int
data_allocated
,
data_read
=
0
,
error_code
=
SLURM_SUCCESS
;
slurmdb_cluster_rec_t
*
cluster
=
NULL
;
slurmdb_federation_rec_t
*
tmp_fed
=
NULL
;
slurmdb_federation_rec_t
*
ret_fed
=
NULL
;
state_file
=
xstrdup_printf
(
"%s/%s"
,
state_save_location
,
FED_MGR_STATE_FILE
);
...
...
@@ -675,7 +682,7 @@ extern int fed_mgr_state_load(char *state_save_location)
if
(
state_fd
<
0
)
{
error
(
"No fed_mgr state file (%s) to recover"
,
state_file
);
xfree
(
state_file
);
return
SLURM_SUCCESS
;
return
NULL
;
}
else
{
data_allocated
=
BUF_SIZE
;
data
=
xmalloc
(
data_allocated
);
...
...
@@ -712,41 +719,40 @@ extern int fed_mgr_state_load(char *state_save_location)
SLURM_MIN_PROTOCOL_VERSION
,
SLURM_PROTOCOL_VERSION
);
error
(
"***********************************************"
);
free_buf
(
buffer
);
return
EFAULT
;
return
NULL
;
}
safe_unpack_time
(
&
buf_time
,
buffer
);
error_code
=
slurmdb_unpack_federation_rec
((
void
**
)
&
tmp_fed
,
ver
,
error_code
=
slurmdb_unpack_federation_rec
((
void
**
)
&
ret_fed
,
ver
,
buffer
);
if
(
error_code
!=
SLURM_SUCCESS
)
goto
unpack_error
;
else
if
(
!
tmp_fed
)
{
else
if
(
!
ret_fed
)
{
error
(
"No feds retrieved"
);
}
if
(
tmp_fed
&&
tmp_fed
->
cluster_list
&&
!
(
cluster
=
list_find_first
(
tmp_fed
->
cluster_list
,
slurmdb_find_cluster_in_list
,
slurmctld_cluster_name
)))
{
error
(
"This cluster doesn't exist in the fed siblings"
);
slurmdb_destroy_federation_rec
(
tmp_fed
);
goto
unpack_error
;
}
else
if
(
cluster
)
{
_join_federation
(
tmp_fed
,
cluster
);
tmp_fed
=
NULL
;
}
else
{
/* We want to free the connections here since they don't exist
* anymore, but they were packed when state was saved. */
slurmdb_cluster_rec_t
*
cluster
;
ListIterator
itr
=
list_iterator_create
(
ret_fed
->
cluster_list
);
while
((
cluster
=
list_next
(
itr
)))
{
slurm_persist_conn_destroy
(
cluster
->
fed
.
recv
);
cluster
->
fed
.
recv
=
NULL
;
slurm_persist_conn_destroy
(
cluster
->
fed
.
send
);
cluster
->
fed
.
send
=
NULL
;
}
list_iterator_destroy
(
itr
);
}
free_buf
(
buffer
);
if
(
tmp_fed
)
slurmdb_destroy_federation_rec
(
tmp_fed
);
return
SLURM_SUCCESS
;
return
ret_fed
;
unpack_error:
free_buf
(
buffer
);
return
SLURM_ERROR
;
return
NULL
;
}
extern
int
_find_sibling_by_ip
(
void
*
x
,
void
*
key
)
...
...
This diff is collapsed.
Click to expand it.
src/slurmctld/fed_mgr.h
+
1
−
1
View file @
7d6c3b77
...
...
@@ -46,7 +46,7 @@ extern int fed_mgr_init(void *db_conn);
extern
int
fed_mgr_fini
();
extern
int
fed_mgr_update_feds
(
slurmdb_update_object_t
*
update
);
extern
int
fed_mgr_state_save
(
char
*
state_save_location
);
extern
int
fed_mgr_state_load
(
char
*
state_save_location
);
extern
slurmdb_federation_rec_t
*
fed_mgr_state_load
(
char
*
state_save_location
);
extern
char
*
fed_mgr_find_sibling_name_by_ip
(
char
*
ip
);
extern
bool
fed_mgr_is_active
();
extern
uint32_t
fed_mgr_get_job_id
(
uint32_t
orig
);
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment