Skip to content
Snippets Groups Projects
Commit baac2ca0 authored by Danny Auble's avatar Danny Auble
Browse files

Fix to not print an error if the timeout is being increased by the forwarding...

Fix to not print an error if the timeout is being increased by the forwarding logic instead of the user. The check now tests against the original timeout instead of the pumped-up timeout that accounts for the underlying layers in the tree the message has to go to.
parent 9409d4f9
No related branches found
No related tags found
No related merge requests found
......@@ -73,7 +73,8 @@ void *_forward_thread(void *arg)
hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
slurm_addr addr;
char buf[8196];
int steps = 0;
/* repeat until we are sure the message was sent */
while((name = hostlist_shift(hl))) {
if(slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
......@@ -106,7 +107,11 @@ void *_forward_thread(void *arg)
xfree(fwd_msg->header.forward.nodelist);
fwd_msg->header.forward.nodelist = xstrdup(buf);
fwd_msg->header.forward.cnt = hostlist_count(hl);
if(fwd_msg->header.forward.cnt>0) {
steps = fwd_msg->header.forward.cnt /
slurm_get_tree_width();
steps += 1;
}
debug3("forward: along with %s",
fwd_msg->header.forward.nodelist);
......@@ -161,7 +166,7 @@ void *_forward_thread(void *arg)
goto cleanup;
}
ret_list = slurm_receive_msgs(fd, fwd_msg->timeout);
ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
if(!ret_list || (fwd_msg->header.forward.cnt != 0
&& list_count(ret_list) == 0)) {
......
......@@ -897,12 +897,13 @@ total_return:
* NOTE: memory is allocated for the returned list
* and must be freed at some point using the list_destroy function.
* IN open_fd - file descriptor to receive msg on
* IN steps - how many steps down the tree we have to wait for
* IN timeout - how long to wait in milliseconds
* RET List - List containing the responses of the children (if any) we
* forwarded the message to. List containing type
* (ret_data_info_t).
*/
List slurm_receive_msgs(slurm_fd fd, int timeout)
List slurm_receive_msgs(slurm_fd fd, int steps, int timeout)
{
char *buf = NULL;
size_t buflen = 0;
......@@ -913,6 +914,7 @@ List slurm_receive_msgs(slurm_fd fd, int timeout)
Buf buffer;
ret_data_info_t *ret_data_info = NULL;
List ret_list = NULL;
int orig_timeout = 0;
xassert(fd >= 0);
......@@ -922,13 +924,18 @@ List slurm_receive_msgs(slurm_fd fd, int timeout)
if (timeout <= 0)
/* convert secs to msec */
timeout = slurm_get_msg_timeout() * 1000;
if(timeout >= (slurm_get_msg_timeout() * 10000)) {
if(steps) {
steps++;
orig_timeout = timeout/steps;
}
debug4("orig_timeout was %d we have %d steps and a timeout of %d",
orig_timeout, steps, timeout);
if(orig_timeout >= (slurm_get_msg_timeout() * 10000)) {
error("You are sending a message with timeout's greater "
"than %d seconds, your's is %d seconds",
(slurm_get_msg_timeout() * 10),
(timeout/1000));
} else if(timeout < 1000) {
} else if(orig_timeout < 1000) {
debug("You are sending a message with a very short timeout of "
"%d milliseconds", timeout);
}
......@@ -1048,11 +1055,12 @@ total_return:
* IN open_fd - file descriptor to receive msg on
* IN/OUT msg - a slurm_msg struct to be filled in by the function
* we use the orig_addr from this var for forwarding.
* IN steps - how many steps down the tree we have to wait for
* IN timeout - how long to wait in milliseconds
* RET int - returns 0 on success, -1 on failure and sets errno
*/
int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr,
slurm_msg_t *msg, int timeout)
slurm_msg_t *msg, int steps, int timeout)
{
char *buf = NULL;
size_t buflen = 0;
......@@ -1061,7 +1069,8 @@ int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr,
void *auth_cred = NULL;
Buf buffer;
ret_data_info_t *ret_data_info = NULL;
int orig_timeout = 0;
xassert(fd >= 0);
if(msg->forward.init != FORWARD_INIT)
......@@ -1082,13 +1091,17 @@ int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr,
if (timeout <= 0)
/* convert secs to msec */
timeout = slurm_get_msg_timeout() * 1000;
if(timeout >= (slurm_get_msg_timeout() * 10000)) {
if(steps) {
steps++;
orig_timeout = timeout/steps;
}
if(orig_timeout >= (slurm_get_msg_timeout() * 10000)) {
error("You are sending a message with timeout's greater "
"than %d seconds, your's is %d seconds",
(slurm_get_msg_timeout() * 10),
(timeout/1000));
} else if(timeout < 1000) {
} else if(orig_timeout < 1000) {
debug("You are sending a message with a very short timeout of "
"%d milliseconds", timeout);
}
......@@ -1743,7 +1756,7 @@ _send_and_recv_msgs(slurm_fd fd, slurm_msg_t *req, int timeout)
timeout += (req->forward.timeout*steps);
}
ret_list = slurm_receive_msgs(fd, timeout);
ret_list = slurm_receive_msgs(fd, steps, timeout);
}
......
......@@ -16,7 +16,7 @@
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
......@@ -328,13 +328,14 @@ int slurm_receive_msg(slurm_fd fd, slurm_msg_t *msg, int timeout);
* list_destroy function.
*
* IN open_fd - file descriptor to receive msg on
* IN steps - how many steps down the tree we have to wait for
* IN timeout - how long to wait in milliseconds
* RET List - List containing the responses of the children (if any) we
* forwarded the message to. List containing type
* (ret_data_info_t). NULL is returned on failure and
* errno is set.
*/
List slurm_receive_msgs(slurm_fd fd, int timeout);
List slurm_receive_msgs(slurm_fd fd, int steps, int timeout);
/*
* Receive a slurm message on the open slurm descriptor "fd" waiting
......@@ -350,11 +351,12 @@ List slurm_receive_msgs(slurm_fd fd, int timeout);
*
* IN open_fd - file descriptor to receive msg on
* OUT resp - a slurm_msg struct to be filled in by the function
* IN steps - how many steps down the tree we have to wait for
* IN timeout - how long to wait in milliseconds
* RET int - returns 0 on success, -1 on failure and sets errno
*/
int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr,
slurm_msg_t *resp, int timeout);
slurm_msg_t *resp, int steps, int timeout);
/**********************************************************************\
* send message functions
......
......@@ -364,7 +364,7 @@ _service_connection(void *arg)
debug3("in the service_connection");
slurm_msg_t_init(msg);
if((rc = slurm_receive_and_forward_msgs(
con->fd, con->cli_addr, msg, 0))
con->fd, con->cli_addr, msg, 0, 0))
!= SLURM_SUCCESS) {
error("service_connection: slurm_receive_msg: %m");
/* if this fails we need to make sure the nodes we forward
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment