diff --git a/src/common/forward.c b/src/common/forward.c index 5044ff04e34c724fc5f3fda355bb998e79eb132a..37cdbf3da0a9b9dfd8fb2e204ab44d7da8750b04 100644 --- a/src/common/forward.c +++ b/src/common/forward.c @@ -270,14 +270,11 @@ extern int forward_msg(forward_struct_t *forward_struct, /* * forward_set - add to the message possible forwards to go to - * IN: forward - forward_t * - message to add forwards to - * IN: thr_count - int - number of messages already done - * IN: pos - int * - posistion in the forward_addr and names - * will change to update to set the - * correct start after forwarding - * information has been added. - * IN: forward_addr- sockaddr_in * - list of address structures to forward to - * IN: forward_names - char * - list of names in MAX_SLURM_NAME increments + * IN: forward - forward_t * - struct to store forward info + * IN: span - int - count of forwards to do + * IN: pos - int * - position in the original messages addr + * structure + * IN: from - forward_t * - information from original message * RET: SLURM_SUCCESS - int */ extern int forward_set(forward_t *forward, @@ -331,6 +328,20 @@ extern int forward_set(forward_t *forward, return SLURM_SUCCESS; } +/* + * forward_set_launch - add to the message possible forwards to go to during + * a job launch + * IN: forward - forward_t * - struct to store forward info + * IN: span - int - count of forwards to do + * IN: step_layout - slurm_step_layout_t * - contains information about hosts + * from original message + * IN: slurmd_addr - slurm_addr * - addrs of hosts to send messages to + * IN: itr - hostlist_iterator_t - count into host list of hosts to + * send messages to + * IN: timeout - int32_t - timeout if any to wait for + * message responses + * RET: SLURM_SUCCESS - int + */ extern int forward_set_launch(forward_t *forward, int span, int *pos, @@ -352,7 +363,8 @@ extern int forward_set_launch(forward_t *forward, if(span > 0) { forward->addr = xmalloc(sizeof(slurm_addr) * span); - forward->name 
= xmalloc(sizeof(char) * (MAX_SLURM_NAME * span)); + forward->name = + xmalloc(sizeof(char) * (MAX_SLURM_NAME * span)); forward->node_id = xmalloc(sizeof(int32_t) * span); forward->timeout = timeout; forward->init = FORWARD_INIT; diff --git a/src/common/forward.h b/src/common/forward.h index 8f6961b6547eefcd2bd671b94b93001e71f1a504..8e3eb797ae4ae4d5e267a0fdb7b36a9faadeb373 100644 --- a/src/common/forward.h +++ b/src/common/forward.h @@ -34,23 +34,174 @@ #include "src/common/dist_tasks.h" /* STRUCTURES */ +/* + * forward_init - initialize forward structure + * IN: forward - forward_t * - struct to store forward info + * IN: from - forward_t * - (OPTIONAL) can be NULL, can be used to + * init the forward to this state + * RET: VOID + */ extern void forward_init(forward_t *forward, forward_t *from); +/* + * forward_msg - logic to forward and collect return codes from children + * of a parent forward + * IN: forward_struct - forward_struct_t * - holds information about message + * that needs to be forwarded to + * child processes + * IN: header - header_t - header from message that came in + * needing to be forwarded. + * RET: SLURM_SUCCESS - int + */ + +/********************************************************************* +Code taken from common/slurm_protocol_api.c +//This function should only be used when a message is being received. 
+ +//set up the forward_struct off of the buffer being received right after +//header is pulled off the received buffer + +forward_struct = xmalloc(sizeof(forward_struct_t)); +forward_struct->buf_len = remaining_buf(buffer); +forward_struct->buf = xmalloc(sizeof(char) * forward_struct->buf_len); +memcpy(forward_struct->buf, &buffer->head[buffer->processed], + forward_struct->buf_len); +forward_struct->ret_list = ret_list; + +forward_struct->timeout = timeout - header.forward.timeout; + +//send the structure created off the buffer and the header from the message +if(forward_msg(forward_struct, &header) == SLURM_ERROR) { + error("problem with forward msg"); +} + +*********************************************************************/ + extern int forward_msg(forward_struct_t *forward_struct, header_t *header); /* - * set_forward_addrs - add to the message possible forwards to go to + * forward_set - add to the message possible forwards to go to * IN: forward - forward_t * - struct to store forward info - * IN: thr_count - int - number of messages already done - * IN: from - forward_t * - info to separate into new forward struct + * IN: span - int - count of forwards to do + * IN: pos - int * - position in the original message's + * structures + * IN: from - forward_t * - information from original message * RET: SLURM_SUCCESS - int */ +/******************************************************************** +Code taken from slurmctld/agent.c +This function should be used when sending a message that could be forwarded. 
+ +//set the span with total count of hosts to send to +int *span = set_span(agent_arg_ptr->node_count); + +// fill in a local forward structure with count of thread to create +// array of names and addrs of hosts and node_id (if any) to be sent to +// along with the timeout of the message +forward.cnt = agent_info_ptr->thread_count; +forward.name = agent_arg_ptr->node_names; +forward.addr = agent_arg_ptr->slurm_addr; +forward.node_id = NULL; +forward.timeout = SLURM_MESSAGE_TIMEOUT_MSEC_STATIC; + +for (i = 0; i < agent_info_ptr->thread_count; i++) { + thread_ptr[thr_count].state = DSH_NEW; + thread_ptr[thr_count].slurm_addr = agent_arg_ptr->slurm_addr[i]; + strncpy(thread_ptr[thr_count].node_name, + &agent_arg_ptr->node_names[i * MAX_SLURM_NAME], + MAX_SLURM_NAME); +// for each 'main' thread we want to add hosts for this one to forward to. +// send the thread_ptr's forward, span at the thr_count, the address of +// position we are in the count, and the forward we set up earlier + forward_set(&thread_ptr[thr_count].forward, + span[thr_count], + &i, + &forward); + + thr_count++; +} + +//free the span +xfree(span); +// set the new thread_count to the number with the forwards taken out of the +// count since we don't keep track of those on the master sender +agent_info_ptr->thread_count = thr_count; +********************************************************************/ extern int forward_set (forward_t *forward, int span, int *pos, forward_t *from); +/* + * forward_set_launch - add to the message possible forwards to go to during + * a job launch + * IN: forward - forward_t * - struct to store forward info + * IN: span - int - count of forwards to do + * IN: step_layout - slurm_step_layout_t * - contains information about hosts + * from original message + * IN: slurmd_addr - slurm_addr * - addrs of hosts to send messages to + * IN: itr - hostlist_iterator_t - count into host list of hosts to + * send messages to + * IN: timeout - int32_t - timeout if any to wait for + 
* message responses + * RET: SLURM_SUCCESS - int + */ + +/******************************************************************** +Code taken from srun/launch.c +This function should be used when sending a launch message that could be forwarded. + +//set the span with total count of hosts to send to +int *span = set_span(job->step_layout->num_hosts); + +//set up hostlist off the nodelist of the job +hostlist = hostlist_create(job->nodelist); +itr = hostlist_iterator_create(hostlist); +job->thr_count = 0; +for (i = 0; i < job->step_layout->num_hosts; i++) { + slurm_msg_t *m = &msg_array_ptr[job->thr_count]; + + m->srun_node_id = (uint32_t)i; + m->msg_type = REQUEST_LAUNCH_TASKS; + m->data = &r; + m->ret_list = NULL; +// set orig_addr.sin_addr.s_addr to 0, meaning no one +// forwarded this message to this node + m->orig_addr.sin_addr.s_addr = 0; + m->buffer = buffer; + + j=0; + while(host = hostlist_next(itr)) { + if(!strcmp(host,job->step_layout->host[i])) { + free(host); + break; + } + j++; + free(host); + } + hostlist_iterator_reset(itr); + memcpy(&m->address, + &job->slurmd_addr[j], + sizeof(slurm_addr)); + +// send the message's forward struct to be filled in with the information from +// the other variables + forward_set_launch(&m->forward, + span[job->thr_count], + &i, + job->step_layout, + job->slurmd_addr, + itr, + opt.msg_timeout); +//increment the count of threads created + job->thr_count++; +} +//free the span and destroy the hostlist we created +xfree(span); +hostlist_iterator_destroy(itr); +hostlist_destroy(hostlist); +********************************************************************/ extern int forward_set_launch (forward_t *forward, int span, int *pos, @@ -59,6 +210,29 @@ extern int forward_set_launch (forward_t *forward, hostlist_iterator_t itr, int32_t timeout); +/* + * no_resp_forwards - Used to respond for nodes not able to respond since + * the parent had failed in some way + * IN: forward - forward_t * - + * IN: ret_list - List * - + * IN: err - 
int - type of error from parent + * RET: SLURM_SUCCESS - int + */ +/********************************************************************* +Code taken from common/slurm_protocol_api.c +//This function should only be used after a message is received. + +// a call to slurm_receive_msg will fill in a ret_list + ret_list = slurm_receive_msg(fd, resp, timeout); +} + +// if ret_list is null or list_count is 0 means there may have been an error +// this function will check to make sure if there were supposed to be forwards +// we handle the return code for the messages +if(!ret_list || list_count(ret_list) == 0) { + no_resp_forwards(&req->forward, &ret_list, errno); +} +**********************************************************************/ extern int no_resp_forwards(forward_t *forward, List *ret_list, int err); /* destroyers */