From 94e863de168a201a4b72aa08e70dd83836766ae0 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Thu, 13 Jul 2006 19:14:45 +0000 Subject: [PATCH] svn merge -r8549:8565 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1 --- src/common/forward.c | 5 ++ src/common/hostlist.c | 75 +++++++++++++++----- src/common/slurm_protocol_api.c | 4 +- src/common/slurm_protocol_defs.h | 1 + src/common/slurm_protocol_pack.c | 3 + src/sbcast/agent.c | 5 +- src/sinfo/opts.c | 14 +++- src/slurmctld/agent.c | 114 ++++++++++++++++++++++++------- 8 files changed, 176 insertions(+), 45 deletions(-) diff --git a/src/common/forward.c b/src/common/forward.c index e922ba27640..817d1a59db2 100644 --- a/src/common/forward.c +++ b/src/common/forward.c @@ -497,6 +497,8 @@ extern int forward_msg_to_next(forward_msg_t *fwd_msg, int err) if(fwd_msg->ret_list) { ret_data_info = xmalloc(sizeof(ret_data_info_t)); ret_data_info->node_name = xstrdup(fwd_msg->node_name); + memcpy(&ret_data_info->addr, &fwd_msg->addr, + sizeof(slurm_addr)); ret_data_info->nodeid = fwd_msg->header.srun_node_id; itr = list_iterator_create(fwd_msg->ret_list); while((type = (ret_types_t *) list_next(itr)) != NULL) { @@ -747,6 +749,8 @@ extern int no_resp_forwards(forward_t *forward, List *ret_list, int err) &forward->name[i * MAX_SLURM_NAME], MAX_SLURM_NAME); ret_data_info->node_name = xstrdup(name); + memcpy(&ret_data_info->addr, &forward->addr[i], + sizeof(slurm_addr)); ret_data_info->nodeid = forward->node_id[i]; } no_forward: @@ -769,6 +773,7 @@ void destroy_forward(forward_t *forward) xfree(forward->name); xfree(forward->node_id); forward->cnt = 0; + forward->init = 0; } } diff --git a/src/common/hostlist.c b/src/common/hostlist.c index c02420ff26c..8010b1a3d9a 100644 --- a/src/common/hostlist.c +++ b/src/common/hostlist.c @@ -519,7 +519,10 @@ static int _width_equiv(unsigned long n, int *wn, unsigned long m, int *wm) */ static size_t host_prefix_end(const char *hostname) { - size_t idx = strlen(hostname) - 1; + size_t idx; + if (!hostname) + return -1; + idx = strlen(hostname) - 1; while (idx >= 0 && isdigit((char) hostname[idx])) idx--; @@ -598,6 +601,8 @@ static void hostname_destroy(hostname_t hn) */ static int hostname_suffix_is_valid(hostname_t hn) { + if (!hn) + return false; return hn->suffix != NULL; } @@ -605,6 +610,8 @@ static int hostname_suffix_is_valid(hostname_t hn) */ static int hostname_suffix_width(hostname_t hn) { + if (!hn) + return -1; assert(hn->suffix != NULL); return (int) strlen(hn->suffix); } @@ -768,12 +775,12 @@ static int hostrange_cmp(hostrange_t h1, hostrange_t h2) /* compare the prefixes of two hostrange objects. * returns: - * < 0 if h1 prefix is less than h2 OR h1 == NULL. + * < 0 if h1 prefix is less than h2 OR h2 == NULL. * * 0 if h1's prefix and h2's prefix match, * UNLESS, either h1 or h2 (NOT both) do not have a valid suffix. * - * > 0 if h1's prefix is greater than h2's OR h2 == NULL. */ + * > 0 if h1's prefix is greater than h2's OR h1 == NULL. */ static int hostrange_prefix_cmp(hostrange_t h1, hostrange_t h2) { int retval; @@ -982,6 +989,8 @@ hostrange_to_string(hostrange_t hr, size_t n, char *buf, char *separator) if (n == 0) return 0; + + assert(hr != NULL); if (hr->singlehost) return snprintf(buf, n, "%s", hr->prefix); @@ -1017,6 +1026,7 @@ static size_t hostrange_numstr(hostrange_t hr, size_t n, char *buf) int len = 0; assert(buf != NULL); + assert(hr != NULL); if (hr->singlehost || n == 0) return 0; @@ -1245,8 +1255,11 @@ hostlist_t _hostlist_create(const char *hostlist, char *sep, char *r_op) hostlist_t new = hostlist_new(); - orig = str = strdup(hostlist); + if (hostlist == NULL) + return new; + orig = str = strdup(hostlist); + /* return an empty list if an empty string was passed in */ if (str == NULL || strlen(str) == 0) goto done; @@ -1356,7 +1369,8 @@ hostlist_t _hostlist_create(const char *hostlist, char *sep, char *r_op) } done: - free(orig); + if(orig) + free(orig); return new; } @@ -1412,7 +1426,7 @@ static int _parse_single_range(const char *str, struct _range *range) free(orig); range->width = strlen(str); return 1; - + error: errno = EINVAL; _error(__FILE__, __LINE__, "Invalid range: `%s'", orig); @@ -1499,6 +1513,7 @@ _push_range_list(hostlist_t hl, char *pfx, struct _range *rng, int n) { int i; + for (i = 0; i < n; i++) { hostlist_push_hr(hl, pfx, rng->lo, rng->hi, rng->width); rng++; @@ -1571,7 +1586,7 @@ hostlist_t hostlist_copy(const hostlist_t hl) int i; hostlist_t new; - if (hl == NULL) + if (!hl) return NULL; LOCK_HOSTLIST(hl); @@ -1595,7 +1610,7 @@ hostlist_t hostlist_copy(const hostlist_t hl) void hostlist_destroy(hostlist_t hl) { int i; - if (hl == NULL) + if (!hl) return; LOCK_HOSTLIST(hl); while (hl->ilist) { @@ -1617,7 +1632,7 @@ int hostlist_push(hostlist_t hl, const char *hosts) { hostlist_t new; int retval; - if (hosts == NULL) + if (!hosts || !hl) return 0; new = hostlist_create(hosts); if (!new) @@ -1635,7 +1650,7 @@ int hostlist_push_host(hostlist_t hl, const char *str) hostrange_t hr; hostname_t hn; - if (str == NULL) + if (!str || !hl) return 0; hn = hostname_create(str); @@ -1658,7 +1673,7 @@ int hostlist_push_list(hostlist_t h1, hostlist_t h2) { int i, n = 0; - if (h2 == NULL) + if (!h2 || !h1) return 0; LOCK_HOSTLIST(h2); @@ -1675,7 +1690,11 @@ int hostlist_push_list(hostlist_t h1, hostlist_t h2) char *hostlist_pop(hostlist_t hl) { char *host = NULL; - + if(!hl) { + error("hostlist_pop: no hoslist given"); + return NULL; + } + LOCK_HOSTLIST(hl); if (hl->nhosts > 0) { hostrange_t hr = hl->hr[hl->nranges - 1]; @@ -1696,6 +1715,10 @@ static void hostlist_shift_iterators(hostlist_t hl, int idx, int depth, int n) { hostlist_iterator_t i; + if(!hl) { + error("hostlist_shift_iterators: no hoslist given"); + return; + } for (i = hl->ilist; i; i = i->next) { if (n == 0) { if (i->idx == idx && i->depth >= depth) @@ -1715,6 +1738,10 @@ char *hostlist_shift(hostlist_t hl) { char *host = NULL; + if(!hl){ + error("hostlist_shift: no hoslist given"); + return NULL; + } LOCK_HOSTLIST(hl); if (hl->nhosts > 0) { @@ -1743,6 +1770,8 @@ char *hostlist_pop_range(hostlist_t hl) hostlist_t hltmp; hostrange_t tail; + if(!hl) + return NULL; LOCK_HOSTLIST(hl); if (hl->nranges < 1 || !(hltmp = hostlist_new())) { UNLOCK_HOSTLIST(hl); @@ -1774,7 +1803,7 @@ char *hostlist_shift_range(hostlist_t hl) int i; char buf[1024]; hostlist_t hltmp = hostlist_new(); - if (!hltmp) + if (!hltmp || !hl) return NULL; LOCK_HOSTLIST(hl); @@ -1816,7 +1845,9 @@ int hostlist_delete(hostlist_t hl, const char *hosts) int n = 0; char *hostname = NULL; hostlist_t hltmp; - + if(!hl) + return -1; + if (!(hltmp = hostlist_create(hosts))) seterrno_ret(EINVAL, 0); @@ -1833,7 +1864,12 @@ int hostlist_delete(hostlist_t hl, const char *hosts) /* XXX watch out! poor implementation follows! (fix it at some point) */ int hostlist_delete_host(hostlist_t hl, const char *hostname) { - int n = hostlist_find(hl, hostname); + int n; + + if(!hl) + return -1; + n = hostlist_find(hl, hostname); + if (n >= 0) hostlist_delete_nth(hl, n); return n >= 0 ? 1 : 0; @@ -1857,6 +1893,8 @@ char * hostlist_nth(hostlist_t hl, int n) char *host = NULL; int i, count; + if(!hl) + return NULL; LOCK_HOSTLIST(hl); count = 0; for (i = 0; i < hl->nranges; i++) { @@ -1879,6 +1917,8 @@ int hostlist_delete_nth(hostlist_t hl, int n) { int i, count; + if(!hl) + return -1; LOCK_HOSTLIST(hl); assert(n >= 0 && n <= hl->nhosts); @@ -1915,6 +1955,9 @@ int hostlist_delete_nth(hostlist_t hl, int n) int hostlist_count(hostlist_t hl) { int retval; + if(!hl) + return -1; + LOCK_HOSTLIST(hl); retval = hl->nhosts; UNLOCK_HOSTLIST(hl); @@ -1926,7 +1969,7 @@ int hostlist_find(hostlist_t hl, const char *hostname) int i, count, ret = -1; hostname_t hn; - if (!hostname) + if (!hostname || !hl) return -1; hn = hostname_create(hostname); diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 20c42298edf..7baabf5edae 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -1578,7 +1578,7 @@ static List _send_recv_rc_msg(slurm_fd fd, slurm_msg_t *req, int timeout) } ret_data_info = xmalloc(sizeof(ret_data_info_t)); - ret_data_info->node_name = xstrdup("localhost"); + ret_data_info->node_name = NULL; ret_data_info->data = NULL; debug3("got reply for %s rc %d %d", ret_data_info->node_name, @@ -1674,7 +1674,7 @@ failed: } ret_data_info = xmalloc(sizeof(ret_data_info_t)); - ret_data_info->node_name = xstrdup("localhost"); + ret_data_info->node_name = NULL; ret_data_info->data = NULL; itr = list_iterator_create(ret_list); while((ret_type = list_next(itr)) != NULL) { diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index cf10651182a..47b686dc63a 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -252,6 +252,7 @@ typedef struct slurm_msg { typedef struct ret_data_info { char *node_name; + slurm_addr addr; uint32_t nodeid; void *data; } ret_data_info_t; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index d24e89dbf1a..7e2e82eec37 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -3121,6 +3121,7 @@ _pack_ret_list(List ret_list, itr_data = list_iterator_create(ret_type->ret_data_list); while((ret_data_info = list_next(itr_data)) != NULL) { packstr(ret_data_info->node_name, buffer); + slurm_pack_slurm_addr(&ret_data_info->addr, buffer); pack32((uint32_t)ret_data_info->nodeid, buffer); msg.data = ret_data_info->data; pack_msg(&msg, buffer); @@ -3156,6 +3157,8 @@ _unpack_ret_list(List *ret_list, ret_data_info = xmalloc(sizeof(ret_data_info_t)); safe_unpackstr_xmalloc(&ret_data_info->node_name, &uint16_tmp, buffer); + slurm_unpack_slurm_addr_no_alloc(&ret_data_info->addr, + buffer); safe_unpack32((uint32_t *)&ret_data_info->nodeid, buffer); if (unpack_msg(&msg, buffer) != SLURM_SUCCESS) diff --git a/src/sbcast/agent.c b/src/sbcast/agent.c index d2a52fde1e0..22d292188c5 100644 --- a/src/sbcast/agent.c +++ b/src/sbcast/agent.c @@ -88,11 +88,10 @@ static void *_agent_thread(void *args) while ((ret_data_info = list_next(data_itr)) != NULL) { if (ret_type->msg_rc == SLURM_SUCCESS) continue; - if (!strcmp(ret_data_info->node_name, - "localhost")) { - xfree(ret_data_info->node_name); + if (!ret_data_info->node_name) { ret_data_info->node_name = xstrdup(thread_ptr->node_name); + ret_data_info->addr = msg->address; } error("REQUEST_FILE_BCAST(%s): %s", ret_data_info->node_name, diff --git a/src/sinfo/opts.c b/src/sinfo/opts.c index d6d0d6848ff..a7002ee7f9b 100644 --- a/src/sinfo/opts.c +++ b/src/sinfo/opts.c @@ -73,6 +73,7 @@ extern void parse_command_line(int argc, char *argv[]) char *env_val = NULL; int opt_char; int option_index; + hostlist_t host_list; static struct option long_options[] = { {"all", no_argument, 0, 'a'}, {"bg", no_argument, 0, 'b'}, @@ -136,7 +137,8 @@ extern void parse_command_line(int argc, char *argv[]) case (int) 'i': params.iterate= atoi(optarg); if (params.iterate <= 0) { - error ("Error: --iterate=%s"); + error ("Error: invalid entry for " + "--iterate=%s", optarg); exit(1); } break; @@ -145,6 +147,16 @@ extern void parse_command_line(int argc, char *argv[]) break; case (int) 'n': params.nodes= xstrdup(optarg); + /* + * confirm valid nodelist entry + */ + host_list = hostlist_create(params.nodes); + if (!host_list) { + error("'%s' invalid entry for --nodes", + optarg); + exit(1); + } + hostlist_destroy(host_list); break; case (int) 'N': params.node_flag = true; diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index 07d7861a9c8..2d47e1ddfde 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -161,6 +161,8 @@ static void _notify_slurmctld_nodes(agent_info_t *agent_ptr, int no_resp_cnt, int retry_cnt); static void _purge_agent_args(agent_arg_t *agent_arg_ptr); static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count); +static int _setup_requeue(agent_arg_t *agent_arg_ptr, thd_t *thread_ptr, + int count, int *spot); static void _slurmctld_free_job_launch_msg(batch_job_launch_msg_t * msg); static void _spawn_retry_agent(agent_arg_t * agent_arg_ptr); static void *_thread_per_group_rpc(void *args); @@ -455,7 +457,7 @@ static void _update_wdog_state(thd_t *thread_ptr, static void *_wdog(void *args) { bool srun_agent = false; - int i; + int i, j, count; agent_info_t *agent_ptr = (agent_info_t *) args; thd_t *thread_ptr = agent_ptr->thread_struct; unsigned long usec = 1250000; @@ -484,6 +486,7 @@ static void *_wdog(void *args) slurm_mutex_lock(&agent_ptr->thread_mutex); for (i = 0; i < agent_ptr->thread_count; i++) { + //info("thread name %s",thread_ptr[i].node_name); if(!thread_ptr[i].ret_list) { _update_wdog_state(&thread_ptr[i], &thread_ptr[i].state, @@ -492,10 +495,14 @@ static void *_wdog(void *args) itr = list_iterator_create( thread_ptr[i].ret_list); while((ret_type = list_next(itr)) != NULL) { - _update_wdog_state( - &thread_ptr[i], - (state_t *)&ret_type->msg_rc, - &thd_comp); + count = list_count(ret_type-> + ret_data_list); + for(j=0; j<count; j++) { + _update_wdog_state( + &thread_ptr[i], + &ret_type->msg_rc, + &thd_comp); + } } list_iterator_destroy(itr); } @@ -518,7 +525,7 @@ static void *_wdog(void *args) if (thread_ptr[i].ret_list) list_destroy(thread_ptr[i].ret_list); } - + if (thd_comp.max_delay) debug2("agent maximum delay %d seconds", thd_comp.max_delay); @@ -817,7 +824,7 @@ static void *_thread_per_group_rpc(void *args) msg.srun_node_id = 0; msg.forward_struct_init = 0; - //info("forwarding to %d",msg.forward.cnt); + //info("%s forwarding to %d",thread_ptr->node_name, msg.forward.cnt); thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT; if (task_ptr->get_reply) { send_rc_again: @@ -844,6 +851,9 @@ static void *_thread_per_group_rpc(void *args) strncpy(thread_ptr->node_name, fwd_msg.node_name, MAX_SLURM_NAME); + memcpy(&thread_ptr->slurm_addr, + &fwd_msg.addr, + sizeof(slurm_addr)); goto send_rc_again; } } @@ -854,7 +864,8 @@ static void *_thread_per_group_rpc(void *args) tmp_ret_list = list_create(destroy_ret_types); fwd_msg.header.srun_node_id = msg.srun_node_id; - fwd_msg.header.forward = msg.forward; + forward_init(&fwd_msg.header.forward, &msg.forward); + //fwd_msg.header.forward = msg.forward; fwd_msg.ret_list = tmp_ret_list; strncpy(fwd_msg.node_name, thread_ptr->node_name, @@ -862,11 +873,15 @@ static void *_thread_per_group_rpc(void *args) fwd_msg.forward_mutex = NULL; if(forward_msg_to_next(&fwd_msg, errno)) { msg.address = fwd_msg.addr; - msg.forward = fwd_msg.header.forward; + forward_init(&msg.forward, + &fwd_msg.header.forward); msg.srun_node_id = fwd_msg.header.srun_node_id; strncpy(thread_ptr->node_name, fwd_msg.node_name, MAX_SLURM_NAME); + memcpy(&thread_ptr->slurm_addr, + &fwd_msg.addr, + sizeof(slurm_addr)); goto send_node_again; } } @@ -894,15 +909,17 @@ static void *_thread_per_group_rpc(void *args) data_itr = list_iterator_create(ret_type->ret_data_list); while((ret_data_info = list_next(data_itr)) != NULL) { rc = ret_type->msg_rc; - if(!found - && !strcmp(ret_data_info->node_name,"localhost")) { - //info("got localhost"); - xfree(ret_data_info->node_name); + if(!found && !ret_data_info->node_name) { ret_data_info->node_name = xstrdup(thread_ptr->node_name); + memcpy(&ret_data_info->addr, + &thread_ptr->slurm_addr, + sizeof(slurm_addr)); + /* info("got localhost changing to %s", */ +/* ret_data_info->node_name); */ found = 1; } -/* info("response for %s rc = %d", */ + /* info("response for %s rc = %d", */ /* ret_data_info->node_name, */ /* ret_type->msg_rc); */ if(rc == SLURM_ERROR) { @@ -1006,7 +1023,7 @@ static void *_thread_per_group_rpc(void *args) ret_data_info->node_name); } list_iterator_destroy(data_itr); - if (srun_agent) + if(srun_agent) thread_state = DSH_FAILED; else if(ret_type->type == REQUEST_PING) /* check if a forward failed */ @@ -1024,6 +1041,8 @@ static void *_thread_per_group_rpc(void *args) cleanup: xfree(args); + + /* handled at end of thread just incase resend is needed */ destroy_forward(&msg.forward); slurm_mutex_lock(thread_mutex_ptr); thread_ptr->ret_list = ret_list; @@ -1046,6 +1065,45 @@ static void _alarm_handler(int dummy) xsignal(SIGALRM, _alarm_handler); } +static int _setup_requeue(agent_arg_t *agent_arg_ptr, thd_t *thread_ptr, + int count, int *spot) +{ + ListIterator itr; + ListIterator data_itr; + ret_data_info_t *ret_data_info = NULL; + ret_types_t *ret_type = NULL; + + itr = list_iterator_create(thread_ptr->ret_list); + while((ret_type = list_next(itr)) != NULL) { + debug2("got return type of %d", ret_type->msg_rc); + if (ret_type->msg_rc != DSH_NO_RESP) + continue; + + data_itr = list_iterator_create(ret_type->ret_data_list); + while((ret_data_info = list_next(data_itr)) != NULL) { + debug("got the name %s to resend out of %d", + ret_data_info->node_name, count); + + if(agent_arg_ptr) { + memcpy(&agent_arg_ptr->slurm_addr[*spot], + &ret_data_info->addr, + sizeof(slurm_addr)); + + strncpy(&agent_arg_ptr-> + node_names[(*spot) * MAX_SLURM_NAME], + ret_data_info->node_name, + MAX_SLURM_NAME); + + if ((++(*spot)) == count) + return 1; + } + } + list_iterator_destroy(data_itr); + } + list_iterator_destroy(itr); + return 0; +} + /* * _queue_agent_retry - Queue any failed RPCs for later replay * IN agent_info_ptr - pointer to info on completed agent requests @@ -1057,7 +1115,7 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count) queued_request_t *queued_req_ptr = NULL; thd_t *thread_ptr = agent_info_ptr->thread_struct; int i, j; - + if (count == 0) return; @@ -1074,13 +1132,23 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count) j = 0; for (i = 0; i < agent_info_ptr->thread_count; i++) { - if (thread_ptr[i].state != DSH_NO_RESP) - continue; - agent_arg_ptr->slurm_addr[j] = thread_ptr[i].slurm_addr; - strncpy(&agent_arg_ptr->node_names[j * MAX_SLURM_NAME], - thread_ptr[i].node_name, MAX_SLURM_NAME); - if ((++j) == count) - break; + if(!thread_ptr[i].ret_list) { + if (thread_ptr[i].state != DSH_NO_RESP) + continue; + debug("got the name %s to resend", + thread_ptr[i].node_name); + + agent_arg_ptr->slurm_addr[j] = + thread_ptr[i].slurm_addr; + strncpy(&agent_arg_ptr->node_names[j * MAX_SLURM_NAME], + thread_ptr[i].node_name, MAX_SLURM_NAME); + if ((++j) == count) + break; + } else { + if(_setup_requeue(agent_arg_ptr, &thread_ptr[i], + count, &j)) + break; + } } if (count != j) { error("agent: Retry count (%d) != actual count (%d)", -- GitLab