From 6b371f524d4d5a2e9d9d02240bbfab82c5e4ed59 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Tue, 17 Oct 2006 18:45:30 +0000 Subject: [PATCH] fixes after the merge --- src/common/forward.c | 18 +++++++++------- src/common/slurm_protocol_api.c | 38 ++++++++++++++++----------------- src/common/slurm_protocol_api.h | 5 ++--- src/slurmctld/agent.c | 1 - src/slurmd/slurmd/slurmd.c | 3 +-- src/srun/allocate.c | 4 ++-- src/srun/opt.c | 2 +- 7 files changed, 35 insertions(+), 36 deletions(-) diff --git a/src/common/forward.c b/src/common/forward.c index ec62d72a708..428ff914945 100644 --- a/src/common/forward.c +++ b/src/common/forward.c @@ -107,13 +107,7 @@ void *_forward_thread(void *arg) xfree(fwd_msg->header.forward.nodelist); fwd_msg->header.forward.nodelist = xstrdup(buf); fwd_msg->header.forward.cnt = hostlist_count(hl); - if(fwd_msg->header.forward.cnt>0) { - steps = (fwd_msg->header.forward.cnt+1) / - slurm_get_tree_width(); - fwd_msg->timeout = (1000*steps); - steps++; - fwd_msg->timeout += (start_timeout*steps); - } + debug3("forward: along with %s", fwd_msg->header.forward.nodelist); @@ -167,7 +161,15 @@ void *_forward_thread(void *arg) } goto cleanup; } - + + if(fwd_msg->header.forward.cnt>0) { + steps = (fwd_msg->header.forward.cnt+1) / + slurm_get_tree_width(); + fwd_msg->timeout = (1000*steps); + steps++; + fwd_msg->timeout += (start_timeout*steps); + } + ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout); if(!ret_list || (fwd_msg->header.forward.cnt != 0 diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 97a24bd2ab5..b67819f9de8 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -839,7 +839,7 @@ int slurm_receive_msg(slurm_fd fd, slurm_msg_t *msg, int timeout) /* Forward message to other nodes */ if(header.forward.cnt > 0) { error("We need to forward this to other nodes use " - "slurm_receive_and_forward_msgs instead"); + "slurm_receive_msg_and_forward instead"); } if ((auth_cred = g_slurm_auth_unpack(buffer)) == NULL) { @@ -925,8 +925,9 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout) /* convert secs to msec */ timeout = slurm_get_msg_timeout() * 1000; if(steps) { - steps++; orig_timeout = timeout/steps; + steps--; + orig_timeout -= (1000*steps); } debug4("orig_timeout was %d we have %d steps and a timeout of %d", orig_timeout, steps, timeout); @@ -1055,12 +1056,11 @@ total_return: * IN open_fd - file descriptor to receive msg on * IN/OUT msg - a slurm_msg struct to be filled in by the function * we use the orig_addr from this var for forwarding. - * IN steps - how many steps down the tree we have to wait for * IN timeout - how long to wait in milliseconds * RET int - returns 0 on success, -1 on failure and sets errno */ -int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, - slurm_msg_t *msg, int steps, int timeout) +int slurm_receive_msg_and_forward(slurm_fd fd, slurm_addr *orig_addr, + slurm_msg_t *msg, int timeout) { char *buf = NULL; size_t buflen = 0; @@ -1068,8 +1068,6 @@ int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, int rc; void *auth_cred = NULL; Buf buffer; - ret_data_info_t *ret_data_info = NULL; - int orig_timeout = timeout; xassert(fd >= 0); @@ -1091,21 +1089,16 @@ int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, if (timeout <= 0) /* convert secs to msec */ timeout = slurm_get_msg_timeout() * 1000; - if(steps) { - steps++; - orig_timeout = timeout/steps; - } - - if(orig_timeout >= (slurm_get_msg_timeout() * 10000)) { + + if(timeout >= (slurm_get_msg_timeout() * 10000)) { error("You are sending a message with timeout's greater " "than %d seconds, your's is %d seconds", (slurm_get_msg_timeout() * 10), (timeout/1000)); - } else if(orig_timeout < 1000) { + } else if(timeout < 1000) { debug("You are sending a message with a very short timeout of " "%d milliseconds", timeout); - } - + } /* * Receive a msg. slurm_msg_recvfrom() will read the message @@ -1134,14 +1127,21 @@ int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, rc = SLURM_PROTOCOL_VERSION_ERROR; goto total_return; } - //info("ret_cnt = %d",header.ret_cnt); if(header.ret_cnt > 0) { - while((ret_data_info = list_pop(header.ret_list))) - list_push(msg->ret_list, ret_data_info); + error("we recieved more than one message back use " + "slurm_receive_msgs instead"); header.ret_cnt = 0; list_destroy(header.ret_list); header.ret_list = NULL; } + //info("ret_cnt = %d",header.ret_cnt); + /* if(header.ret_cnt > 0) { */ +/* while((ret_data_info = list_pop(header.ret_list))) */ +/* list_push(msg->ret_list, ret_data_info); */ +/* header.ret_cnt = 0; */ +/* list_destroy(header.ret_list); */ +/* header.ret_list = NULL; */ +/* } */ /* * header.orig_addr will be set to where the first message * came from if this is a forward else we set the diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index 0e343c525cc..2259188da8e 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -351,12 +351,11 @@ List slurm_receive_msgs(slurm_fd fd, int steps, int timeout); * * IN open_fd - file descriptor to receive msg on * OUT resp - a slurm_msg struct to be filled in by the function - * IN steps - how many steps down the tree we have to wait for * IN timeout - how long to wait in milliseconds * RET int - returns 0 on success, -1 on failure and sets errno */ -int slurm_receive_and_forward_msgs(slurm_fd fd, slurm_addr *orig_addr, - slurm_msg_t *resp, int steps, int timeout); +int slurm_receive_msg_and_forward(slurm_fd fd, slurm_addr *orig_addr, + slurm_msg_t *resp, int timeout); /**********************************************************************\ * send message functions diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index f3ef27f7e2c..554cafa55cb 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -1029,7 +1029,6 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count) j = 0; for (i = 0; i < agent_info_ptr->thread_count; i++) { if(!thread_ptr[i].ret_list) { - char ip_buf[32]; if (thread_ptr[i].state != DSH_NO_RESP) continue; diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index fea862f71cb..395ca18322b 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -364,8 +364,7 @@ _service_connection(void *arg) debug3("in the service_connection"); slurm_msg_t_init(msg); - if((rc = slurm_receive_and_forward_msgs( - con->fd, con->cli_addr, msg, 0, 0)) + if((rc = slurm_receive_msg_and_forward(con->fd, con->cli_addr, msg, 0)) != SLURM_SUCCESS) { error("service_connection: slurm_receive_msg: %m"); /* if this fails we need to make sure the nodes we forward diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 714195fb29e..788296e7a31 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -667,10 +667,10 @@ create_job_step(srun_job_t *job) /* Number of hosts in job may not have been initialized yet if - * --jobid was used or only SLURM_JOBID was set in user env. + * --jobid was used or only SLURM_JOBID was set in user env. * Reset the value here just in case. */ - job->nhosts = job->step_layout->num_hosts; + job->nhosts = job->step_layout->node_cnt; if(!job->step_layout) { error("step_layout not returned"); diff --git a/src/srun/opt.c b/src/srun/opt.c index abab6989bc3..ddc346f0407 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -923,7 +923,7 @@ static void _opt_default() opt.max_launch_time = 120;/* 120 seconds to launch job */ opt.max_exit_timeout= 60; /* Warn user 60 seconds after task exit */ /* Default launch msg timeout */ - opt.msg_timeout = SLURM_MESSAGE_TIMEOUT_SEC_STATIC; + opt.msg_timeout = slurm_get_msg_timeout(); for (i=0; i<SYSTEM_DIMENSIONS; i++) opt.geometry[i] = (uint16_t) NO_VAL; -- GitLab