Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Slurm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tud-zih-energy
Slurm
Commits
0f62d94e
Commit
0f62d94e
authored
19 years ago
by
Danny Auble
Browse files
Options
Downloads
Patches
Plain Diff
forward documentation
parent
91f3a749
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/common/forward.c
+21
-9
21 additions, 9 deletions
src/common/forward.c
src/common/forward.h
+177
-3
177 additions, 3 deletions
src/common/forward.h
with
198 additions
and
12 deletions
src/common/forward.c
+
21
−
9
View file @
0f62d94e
...
...
@@ -270,14 +270,11 @@ extern int forward_msg(forward_struct_t *forward_struct,
/*
* forward_set - add to the message possible forwards to go to
* IN: forward - forward_t * - message to add forwards to
* IN: thr_count - int - number of messages already done
* IN: pos - int * - posistion in the forward_addr and names
* will change to update to set the
* correct start after forwarding
* information has been added.
* IN: forward_addr- sockaddr_in * - list of address structures to forward to
* IN: forward_names - char * - list of names in MAX_SLURM_NAME increments
* IN: forward - forward_t * - struct to store forward info
* IN: span - int - count of forwards to do
* IN: pos - int * - position in the original messages addr
* structure
* IN: from - forward_t * - information from original message
* RET: SLURM_SUCCESS - int
*/
extern
int
forward_set
(
forward_t
*
forward
,
...
...
@@ -331,6 +328,20 @@ extern int forward_set(forward_t *forward,
return
SLURM_SUCCESS
;
}
/*
* forward_set_launch - add to the message possible forwards to go to during
* a job launch
* IN: forward - forward_t * - struct to store forward info
* IN: span - int - count of forwards to do
* IN: step_layout - slurm_step_layout_t * - contains information about hosts
* from original message
* IN: slurmd_addr - slurm_addr * - addrs of hosts to send messages to
* IN: itr - hostlist_iterator_t - count into host list of hosts to
* send messages to
* IN: timeout - int32_t - timeout if any to wait for
* message responses
* RET: SLURM_SUCCESS - int
*/
extern
int
forward_set_launch
(
forward_t
*
forward
,
int
span
,
int
*
pos
,
...
...
@@ -352,7 +363,8 @@ extern int forward_set_launch(forward_t *forward,
if
(
span
>
0
)
{
forward
->
addr
=
xmalloc
(
sizeof
(
slurm_addr
)
*
span
);
forward
->
name
=
xmalloc
(
sizeof
(
char
)
*
(
MAX_SLURM_NAME
*
span
));
forward
->
name
=
xmalloc
(
sizeof
(
char
)
*
(
MAX_SLURM_NAME
*
span
));
forward
->
node_id
=
xmalloc
(
sizeof
(
int32_t
)
*
span
);
forward
->
timeout
=
timeout
;
forward
->
init
=
FORWARD_INIT
;
...
...
This diff is collapsed.
Click to expand it.
src/common/forward.h
+
177
−
3
View file @
0f62d94e
...
...
@@ -34,23 +34,174 @@
#include
"src/common/dist_tasks.h"
/* STRUCTURES */
/*
* forward_init - initialize forward structure
* IN: forward - forward_t * - struct to store forward info
* IN: from - forward_t * - (OPTIONAL) can be NULL, can be used to
* init the forward to this state
* RET: VOID
*/
extern
void
forward_init
(
forward_t
*
forward
,
forward_t
*
from
);
/*
* forward_msg - logic to forward and collect return codes from children
* of a parent forward
* IN: forward_struct - forward_struct_t * - holds information about message
* that needs to be forwarded to
* child processes
* IN: header - header_t - header from message that came in
* needing to be forwarded.
* RET: SLURM_SUCCESS - int
*/
/*********************************************************************
Code taken from common/slurm_protocol_api.c
//This function should only be used when a message is being received.
//set up the forward_struct off of the buffer being received right after
//header is pulled off the received buffer
forward_struct = xmalloc(sizeof(forward_struct_t));
forward_struct->buf_len = remaining_buf(buffer);
forward_struct->buf = xmalloc(sizeof(char) * forward_struct->buf_len);
memcpy(forward_struct->buf, &buffer->head[buffer->processed],
forward_struct->buf_len);
forward_struct->ret_list = ret_list;
forward_struct->timeout = timeout - header.forward.timeout;
//send the structure created off the buffer and the header from the message
if(forward_msg(forward_struct, &header) == SLURM_ERROR) {
error("problem with forward msg");
}
*********************************************************************/
extern
int
forward_msg
(
forward_struct_t
*
forward_struct
,
header_t
*
header
);
/*
*
set_
forward_
addrs
- add to the message possible forwards to go to
* forward_
set
- add to the message possible forwards to go to
* IN: forward - forward_t * - struct to store forward info
* IN: thr_count - int - number of messages already done
* IN: from - forward_t * - info to separate into new forward struct
* IN: span - int - count of forwards to do
* IN: pos - int * - position in the original messages
* structures
* IN: from - forward_t * - information from original message
* RET: SLURM_SUCCESS - int
*/
/********************************************************************
Code taken from slurmctld/agent.c
This function should be used when sending a message that could be forwarded.
//set the span with total count of hosts to send to
int *span = set_span(agent_arg_ptr->node_count);
// fill in a local forward structure with count of thread to create
// array of names and addrs of hosts and node_id (if any) to be sent to
// along with the timeout of the message
forward.cnt = agent_info_ptr->thread_count;
forward.name = agent_arg_ptr->node_names;
forward.addr = agent_arg_ptr->slurm_addr;
forward.node_id = NULL;
forward.timeout = SLURM_MESSAGE_TIMEOUT_MSEC_STATIC;
for (i = 0; i < agent_info_ptr->thread_count; i++) {
thread_ptr[thr_count].state = DSH_NEW;
thread_ptr[thr_count].slurm_addr = agent_arg_ptr->slurm_addr[i];
strncpy(thread_ptr[thr_count].node_name,
&agent_arg_ptr->node_names[i * MAX_SLURM_NAME],
MAX_SLURM_NAME);
// for each 'main' thread we want to add hosts for this one to forward to.
// send the thread_ptr's forward, span at the thr_count, the address of
// position we are in the count, and the forward we set up earlier
forward_set(&thread_ptr[thr_count].forward,
span[thr_count],
&i,
&forward);
thr_count++;
}
//free the span
xfree(span);
// set the new thread_count to the number with the forwards taken out of the
// count since we don't keep track of those on the master sender
agent_info_ptr->thread_count = thr_count;
********************************************************************/
extern
int
forward_set
(
forward_t
*
forward
,
int
span
,
int
*
pos
,
forward_t
*
from
);
/*
* forward_set_launch - add to the message possible forwards to go to during
* a job launch
* IN: forward - forward_t * - struct to store forward info
* IN: span - int - count of forwards to do
* IN: step_layout - slurm_step_layout_t * - contains information about hosts
* from original message
* IN: slurmd_addr - slurm_addr * - addrs of hosts to send messages to
* IN: itr - hostlist_iterator_t - count into host list of hosts to
* send messages to
* IN: timeout - int32_t - timeout if any to wait for
* message responses
* RET: SLURM_SUCCESS - int
*/
/********************************************************************
Code taken from srun/launch.c
This function should be used when sending a launch message that could be forwarded.
//set the span with total count of hosts to send to
int *span = set_span(job->step_layout->num_hosts);
//set up hostlist off the nodelist of the job
hostlist = hostlist_create(job->nodelist);
itr = hostlist_iterator_create(hostlist);
job->thr_count = 0;
for (i = 0; i < job->step_layout->num_hosts; i++) {
slurm_msg_t *m = &msg_array_ptr[job->thr_count];
m->srun_node_id = (uint32_t)i;
m->msg_type = REQUEST_LAUNCH_TASKS;
m->data = &r;
m->ret_list = NULL;
// set orig_addr.sin_addr.s_addr to 0, meaning that no one
// forwarded this message to this node
m->orig_addr.sin_addr.s_addr = 0;
m->buffer = buffer;
j=0;
while(host = hostlist_next(itr)) {
if(!strcmp(host,job->step_layout->host[i])) {
free(host);
break;
}
j++;
free(host);
}
hostlist_iterator_reset(itr);
memcpy(&m->address,
&job->slurmd_addr[j],
sizeof(slurm_addr));
// send the messages forward struct to be filled in with the information from
// the other variables
forward_set_launch(&m->forward,
span[job->thr_count],
&i,
job->step_layout,
job->slurmd_addr,
itr,
opt.msg_timeout);
//increment the count of threads created
job->thr_count++;
}
//free the span and destroy the hostlist we created
xfree(span);
hostlist_iterator_destroy(itr);
hostlist_destroy(hostlist);
********************************************************************/
extern
int
forward_set_launch
(
forward_t
*
forward
,
int
span
,
int
*
pos
,
...
...
@@ -59,6 +210,29 @@ extern int forward_set_launch (forward_t *forward,
hostlist_iterator_t
itr
,
int32_t
timeout
);
/*
* no_resp_forwards - Used to respond for nodes not able to respond since
* the parent had failed in some way
* IN: forward - forward_t * -
* IN: ret_list - List * -
* IN: err - int - type of error from parent
* RET: SLURM_SUCCESS - int
*/
/*********************************************************************
Code taken from common/slurm_protocol_api.c
//This function should only be used after a message is received.
// a call to slurm_receive_msg will fill in a ret_list
ret_list = slurm_receive_msg(fd, resp, timeout);
}
// if ret_list is NULL or list_count is 0, there may have been an error.
// this function checks whether there were supposed to be forwards so that
// we handle the return code for those messages
if(!ret_list || list_count(ret_list) == 0) {
no_resp_forwards(&req->forward, &ret_list, errno);
}
**********************************************************************/
extern
int
no_resp_forwards
(
forward_t
*
forward
,
List
*
ret_list
,
int
err
);
/* destroyers */
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment