diff --git a/src/common/forward.c b/src/common/forward.c index ec62d72a70866e33e3b39f3d185fefb6d281b6a9..428ff91494561d7e961e6069826d6bc746832e16 100644 --- a/src/common/forward.c +++ b/src/common/forward.c @@ -107,13 +107,7 @@ void *_forward_thread(void *arg) xfree(fwd_msg->header.forward.nodelist); fwd_msg->header.forward.nodelist = xstrdup(buf); fwd_msg->header.forward.cnt = hostlist_count(hl); - if(fwd_msg->header.forward.cnt>0) { - steps = (fwd_msg->header.forward.cnt+1) / - slurm_get_tree_width(); - fwd_msg->timeout = (1000*steps); - steps++; - fwd_msg->timeout += (start_timeout*steps); - } + debug3("forward: along with %s", fwd_msg->header.forward.nodelist); @@ -167,7 +161,15 @@ void *_forward_thread(void *arg) } goto cleanup; } - + + if(fwd_msg->header.forward.cnt>0) { + steps = (fwd_msg->header.forward.cnt+1) / + slurm_get_tree_width(); + fwd_msg->timeout = (1000*steps); + steps++; + fwd_msg->timeout += (start_timeout*steps); + } + ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout); if(!ret_list || (fwd_msg->header.forward.cnt != 0 diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 97a24bd2ab54ff96d86ad037166a49023f3e6f81..b67819f9de8fd71613ee5daf4c8c4aa670d19570 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -839,7 +839,7 @@ int slurm_receive_msg(slurm_fd fd, slurm_msg_t *msg, int timeout) /* Forward message to other nodes */ if(header.forward.cnt > 0) { error("We need to forward this to other nodes use " - "slurm_receive_and_forward_msgs instead"); + "slurm_receive_msg_and_forward instead"); } if ((auth_cred = g_slurm_auth_unpack(buffer)) == NULL) { @@ -925,8 +925,9 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout) /* convert secs to msec */ timeout = slurm_get_msg_timeout() * 1000; if(steps) { - steps++; orig_timeout = timeout/steps; + steps--; + orig_timeout -= (1000*steps); } debug4("orig_timeout was %d we have %d steps and a timeout of %d", orig_timeout, steps, timeout); @@ -1055,12 +1056,11 @@ total_return: * IN open_fd - file descriptor to receive msg on * IN/OUT msg - a slurm_msg struct to be filled in by the function * we use the orig_addr from this var for forwarding. - * IN steps - how many steps down the tree we have to wait for * IN timeout - how long to wait in milliseconds * RET int - returns 0 on success, -1 on failure and sets errno */ -int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, - slurm_msg_t *msg, int steps, int timeout) +int slurm_receive_msg_and_forward(slurm_fd fd, slurm_addr *orig_addr, + slurm_msg_t *msg, int timeout) { char *buf = NULL; size_t buflen = 0; @@ -1068,8 +1068,6 @@ int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, int rc; void *auth_cred = NULL; Buf buffer; - ret_data_info_t *ret_data_info = NULL; - int orig_timeout = timeout; xassert(fd >= 0); @@ -1091,21 +1089,16 @@ int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, if (timeout <= 0) /* convert secs to msec */ timeout = slurm_get_msg_timeout() * 1000; - if(steps) { - steps++; - orig_timeout = timeout/steps; - } - - if(orig_timeout >= (slurm_get_msg_timeout() * 10000)) { + + if(timeout >= (slurm_get_msg_timeout() * 10000)) { error("You are sending a message with timeout's greater " "than %d seconds, your's is %d seconds", (slurm_get_msg_timeout() * 10), (timeout/1000)); - } else if(orig_timeout < 1000) { + } else if(timeout < 1000) { debug("You are sending a message with a very short timeout of " "%d milliseconds", timeout); - } - + } /* * Receive a msg. slurm_msg_recvfrom() will read the message @@ -1134,14 +1127,21 @@ int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, rc = SLURM_PROTOCOL_VERSION_ERROR; goto total_return; } - //info("ret_cnt = %d",header.ret_cnt); if(header.ret_cnt > 0) { - while((ret_data_info = list_pop(header.ret_list))) - list_push(msg->ret_list, ret_data_info); + error("we recieved more than one message back use " + "slurm_receive_msgs instead"); header.ret_cnt = 0; list_destroy(header.ret_list); header.ret_list = NULL; } + //info("ret_cnt = %d",header.ret_cnt); + /* if(header.ret_cnt > 0) { */ +/* while((ret_data_info = list_pop(header.ret_list))) */ +/* list_push(msg->ret_list, ret_data_info); */ +/* header.ret_cnt = 0; */ +/* list_destroy(header.ret_list); */ +/* header.ret_list = NULL; */ +/* } */ /* * header.orig_addr will be set to where the first message * came from if this is a forward else we set the diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 0e343c525cc6530d021c78c4418315e1e41b07bf..2259188da8ef79724bd9db3147c676048e60c039 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -351,12 +351,11 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout); * * IN open_fd - file descriptor to receive msg on * OUT resp - a slurm_msg struct to be filled in by the function - * IN steps - how many steps down the tree we have to wait for * IN timeout - how long to wait in milliseconds * RET int - returns 0 on success, -1 on failure and sets errno */ -int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, - slurm_msg_t *resp, int steps, int timeout); +int slurm_receive_msg_and_forward(slurm_fd fd, slurm_addr *orig_addr, + slurm_msg_t *resp, int timeout); /**********************************************************************\ * send message functions diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index f3ef27f7e2cf29ff91272d1b9e119697899aabf7..554cafa55cb0ba843c2ff013f3e059da097724b7 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -1029,7 +1029,6 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count) j = 0; for (i = 0; i < agent_info_ptr->thread_count; i++) { if(!thread_ptr[i].ret_list) { - char ip_buf[32]; if (thread_ptr[i].state != DSH_NO_RESP) continue; diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index fea862f71cb09aa0c4c368841ae0c78f8b66c58a..395ca18322b3b02419ec3d214e42f489cd0ff960 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -364,8 +364,7 @@ _service_connection(void *arg) debug3("in the service_connection"); slurm_msg_t_init(msg); - if((rc = slurm_receive_and_forward_msgs( - con->fd, con->cli_addr, msg, 0, 0)) + if((rc = slurm_receive_msg_and_forward(con->fd, con->cli_addr, msg, 0)) != SLURM_SUCCESS) { error("service_connection: slurm_receive_msg: %m"); /* if this fails we need to make sure the nodes we forward diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 714195fb29ee3f505e88a1c3d8346a5253dbcbba..788296e7a31977bff8162ec18763d372f4008878 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -667,10 +667,10 @@ create_job_step(srun_job_t *job) /* Number of hosts in job may not have been initialized yet if - * --jobid was used or only SLURM_JOBID was set in user env. + * --jobid was used or only SLURM_JOBID was set in user env. * Reset the value here just in case. */ - job->nhosts = job->step_layout->num_hosts; + job->nhosts = job->step_layout->node_cnt; if(!job->step_layout) { error("step_layout not returned"); diff --git a/src/srun/opt.c b/src/srun/opt.c index abab6989bc3e465a6462a6ab1702aa587410c4ed..ddc346f0407b458eeedf8a87299745de152c99a1 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -923,7 +923,7 @@ static void _opt_default() opt.max_launch_time = 120;/* 120 seconds to launch job */ opt.max_exit_timeout= 60; /* Warn user 60 seconds after task exit */ /* Default launch msg timeout */ - opt.msg_timeout = SLURM_MESSAGE_TIMEOUT_SEC_STATIC; + opt.msg_timeout = slurm_get_msg_timeout(); for (i=0; i<SYSTEM_DIMENSIONS; i++) opt.geometry[i] = (uint16_t) NO_VAL;