From 94e863de168a201a4b72aa08e70dd83836766ae0 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Thu, 13 Jul 2006 19:14:45 +0000
Subject: [PATCH] svn merge -r8549:8565
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

---
 src/common/forward.c             |   5 ++
 src/common/hostlist.c            |  75 +++++++++++++++-----
 src/common/slurm_protocol_api.c  |   4 +-
 src/common/slurm_protocol_defs.h |   1 +
 src/common/slurm_protocol_pack.c |   3 +
 src/sbcast/agent.c               |   5 +-
 src/sinfo/opts.c                 |  14 +++-
 src/slurmctld/agent.c            | 114 ++++++++++++++++++++++++-------
 8 files changed, 176 insertions(+), 45 deletions(-)

diff --git a/src/common/forward.c b/src/common/forward.c
index e922ba27640..817d1a59db2 100644
--- a/src/common/forward.c
+++ b/src/common/forward.c
@@ -497,6 +497,8 @@ extern int forward_msg_to_next(forward_msg_t *fwd_msg, int err)
 	if(fwd_msg->ret_list) {
 		ret_data_info = xmalloc(sizeof(ret_data_info_t));
 		ret_data_info->node_name = xstrdup(fwd_msg->node_name);
+		memcpy(&ret_data_info->addr, &fwd_msg->addr, 
+		       sizeof(slurm_addr));
 		ret_data_info->nodeid = fwd_msg->header.srun_node_id;
 		itr = list_iterator_create(fwd_msg->ret_list);	
 		while((type = (ret_types_t *) list_next(itr)) != NULL) {
@@ -747,6 +749,8 @@ extern int no_resp_forwards(forward_t *forward, List *ret_list, int err)
 			&forward->name[i * MAX_SLURM_NAME], 
 			MAX_SLURM_NAME);
 		ret_data_info->node_name = xstrdup(name);
+		memcpy(&ret_data_info->addr, &forward->addr[i], 
+		       sizeof(slurm_addr));
 		ret_data_info->nodeid = forward->node_id[i];
 	}
 no_forward:
@@ -769,6 +773,7 @@ void destroy_forward(forward_t *forward)
 		xfree(forward->name);
 		xfree(forward->node_id);
 		forward->cnt = 0;
+		forward->init = 0;
 	}
 }
 
diff --git a/src/common/hostlist.c b/src/common/hostlist.c
index c02420ff26c..8010b1a3d9a 100644
--- a/src/common/hostlist.c
+++ b/src/common/hostlist.c
@@ -519,7 +519,10 @@ static int _width_equiv(unsigned long n, int *wn, unsigned long m, int *wm)
  */
 static size_t host_prefix_end(const char *hostname)
 {
-	size_t idx = strlen(hostname) - 1;
+	size_t idx; 
+	if (!hostname)
+		return -1;
+	idx = strlen(hostname) - 1;
 
 	while (idx >= 0 && isdigit((char) hostname[idx])) 
 		idx--;
@@ -598,6 +601,8 @@ static void hostname_destroy(hostname_t hn)
  */
 static int hostname_suffix_is_valid(hostname_t hn)
 {
+	if (!hn)
+		return false;
 	return hn->suffix != NULL;
 }
 
@@ -605,6 +610,8 @@ static int hostname_suffix_is_valid(hostname_t hn)
  */
 static int hostname_suffix_width(hostname_t hn)
 {
+	if (!hn)
+		return -1;
 	assert(hn->suffix != NULL);
 	return (int) strlen(hn->suffix);
 }
@@ -768,12 +775,12 @@ static int hostrange_cmp(hostrange_t h1, hostrange_t h2)
 
 /* compare the prefixes of two hostrange objects. 
  * returns:
- *    < 0   if h1 prefix is less than h2 OR h1 == NULL.
+ *    < 0   if h1 prefix is less than h2 OR h2 == NULL.
  *
  *      0   if h1's prefix and h2's prefix match, 
  *          UNLESS, either h1 or h2 (NOT both) do not have a valid suffix.
  *
- *    > 0   if h1's prefix is greater than h2's OR h2 == NULL. */
+ *    > 0   if h1's prefix is greater than h2's OR h1 == NULL. */
 static int hostrange_prefix_cmp(hostrange_t h1, hostrange_t h2)
 {
 	int retval;
@@ -982,6 +989,8 @@ hostrange_to_string(hostrange_t hr, size_t n, char *buf, char *separator)
 
 	if (n == 0)
 		return 0;
+	
+	assert(hr != NULL);
 
 	if (hr->singlehost)
 		return snprintf(buf, n, "%s", hr->prefix);
@@ -1017,6 +1026,7 @@ static size_t hostrange_numstr(hostrange_t hr, size_t n, char *buf)
 	int len = 0;
 
 	assert(buf != NULL);
+	assert(hr != NULL);
 
 	if (hr->singlehost || n == 0)
 		return 0;
@@ -1245,8 +1255,11 @@ hostlist_t _hostlist_create(const char *hostlist, char *sep, char *r_op)
 
 	hostlist_t new = hostlist_new();
 
-	orig = str = strdup(hostlist);
+	if (hostlist == NULL)
+		return new;
 
+	orig = str = strdup(hostlist);
+	
 	/* return an empty list if an empty string was passed in */
 	if (str == NULL || strlen(str) == 0)
 		goto done;
@@ -1356,7 +1369,8 @@ hostlist_t _hostlist_create(const char *hostlist, char *sep, char *r_op)
 	}
 
   done:
-	free(orig);
+	if(orig)
+		free(orig);
 
 	return new;
 }
@@ -1412,7 +1426,7 @@ static int _parse_single_range(const char *str, struct _range *range)
 	free(orig);
 	range->width = strlen(str);
 	return 1;
-
+	
   error:
     errno = EINVAL;
 	_error(__FILE__, __LINE__, "Invalid range: `%s'", orig);
@@ -1499,6 +1513,7 @@ _push_range_list(hostlist_t hl, char *pfx, struct _range *rng,
 	int n)
 {
 	int i;
+	
 	for (i = 0; i < n; i++) {
 		hostlist_push_hr(hl, pfx, rng->lo, rng->hi, rng->width);
 		rng++;
@@ -1571,7 +1586,7 @@ hostlist_t hostlist_copy(const hostlist_t hl)
 	int i;
 	hostlist_t new;
 
-	if (hl == NULL)
+	if (!hl)
 		return NULL;
 
 	LOCK_HOSTLIST(hl);
@@ -1595,7 +1610,7 @@ hostlist_t hostlist_copy(const hostlist_t hl)
 void hostlist_destroy(hostlist_t hl)
 {
 	int i;
-	if (hl == NULL)
+	if (!hl)
 		return;
 	LOCK_HOSTLIST(hl);
 	while (hl->ilist) {
@@ -1617,7 +1632,7 @@ int hostlist_push(hostlist_t hl, const char *hosts)
 {
 	hostlist_t new;
 	int retval;
-	if (hosts == NULL)
+	if (!hosts || !hl)
 		return 0;
 	new = hostlist_create(hosts);
 	if (!new)
@@ -1635,7 +1650,7 @@ int hostlist_push_host(hostlist_t hl, const char *str)
 	hostrange_t hr;
 	hostname_t hn;
 
-	if (str == NULL)
+	if (!str || !hl)
 		return 0;
 
 	hn = hostname_create(str);
@@ -1658,7 +1673,7 @@ int hostlist_push_list(hostlist_t h1, hostlist_t h2)
 {
 	int i, n = 0;
 
-	if (h2 == NULL)
+	if (!h2 || !h1)
 		return 0;
 
 	LOCK_HOSTLIST(h2);
@@ -1675,7 +1690,11 @@ int hostlist_push_list(hostlist_t h1, hostlist_t h2)
 char *hostlist_pop(hostlist_t hl)
 {
 	char *host = NULL;
-
+	if(!hl) {
+		error("hostlist_pop: no hostlist given");
+		return NULL;
+	}
+	
 	LOCK_HOSTLIST(hl);
 	if (hl->nhosts > 0) {
 		hostrange_t hr = hl->hr[hl->nranges - 1];
@@ -1696,6 +1715,10 @@ static void
 hostlist_shift_iterators(hostlist_t hl, int idx, int depth, int n)
 {
 	hostlist_iterator_t i;
+	if(!hl) {
+		error("hostlist_shift_iterators: no hostlist given");
+		return;
+	}
 	for (i = hl->ilist; i; i = i->next) {
 		if (n == 0) {
 			if (i->idx == idx && i->depth >= depth)
@@ -1715,6 +1738,10 @@ char *hostlist_shift(hostlist_t hl)
 {
 	char *host = NULL;
 
+	if(!hl){
+		error("hostlist_shift: no hostlist given");
+		return NULL;
+	}
 	LOCK_HOSTLIST(hl);
 
 	if (hl->nhosts > 0) {
@@ -1743,6 +1770,8 @@ char *hostlist_pop_range(hostlist_t hl)
 	hostlist_t hltmp;
 	hostrange_t tail;
 
+	if(!hl)
+		return NULL;
 	LOCK_HOSTLIST(hl);
 	if (hl->nranges < 1 || !(hltmp = hostlist_new())) {
 		UNLOCK_HOSTLIST(hl);
@@ -1774,7 +1803,10 @@ char *hostlist_shift_range(hostlist_t hl)
 	int i;
 	char buf[1024];
 	hostlist_t hltmp = hostlist_new();
-	if (!hltmp)
+	if (!hltmp || !hl) {
+		/* hltmp is allocated above; free it when hl is NULL */
+		hostlist_destroy(hltmp);
 		return NULL;
+	}
 
 	LOCK_HOSTLIST(hl);
@@ -1816,7 +1845,9 @@ int hostlist_delete(hostlist_t hl, const char *hosts)
 	int n = 0;
 	char *hostname = NULL;
 	hostlist_t hltmp;
-
+	if(!hl)
+		return -1;
+	
 	if (!(hltmp = hostlist_create(hosts)))
 		seterrno_ret(EINVAL, 0);
 
@@ -1833,7 +1864,12 @@ int hostlist_delete(hostlist_t hl, const char *hosts)
 /* XXX watch out! poor implementation follows! (fix it at some point) */
 int hostlist_delete_host(hostlist_t hl, const char *hostname)
 {
-	int n = hostlist_find(hl, hostname);
+	int n;
+
+	if(!hl)
+		return -1;
+	n = hostlist_find(hl, hostname);
+
 	if (n >= 0)
 		hostlist_delete_nth(hl, n);
 	return n >= 0 ? 1 : 0;
@@ -1857,6 +1893,8 @@ char * hostlist_nth(hostlist_t hl, int n)
 	char *host = NULL;
 	int   i, count;
 
+	if(!hl)
+		return NULL;
 	LOCK_HOSTLIST(hl);
 	count = 0;
 	for (i = 0; i < hl->nranges; i++) {
@@ -1879,6 +1917,8 @@ int hostlist_delete_nth(hostlist_t hl, int n)
 {
 	int i, count;
 
+	if(!hl)
+		return -1;
 	LOCK_HOSTLIST(hl);
 	assert(n >= 0 && n <= hl->nhosts);
 
@@ -1915,6 +1955,9 @@ int hostlist_delete_nth(hostlist_t hl, int n)
 int hostlist_count(hostlist_t hl)
 {
 	int retval;
+	if(!hl)
+		return -1;
+
 	LOCK_HOSTLIST(hl);
 	retval = hl->nhosts;
 	UNLOCK_HOSTLIST(hl);
@@ -1926,7 +1969,7 @@ int hostlist_find(hostlist_t hl, const char *hostname)
 	int i, count, ret = -1;
 	hostname_t hn;
 
-	if (!hostname)
+	if (!hostname || !hl)
 		return -1;
 
 	hn = hostname_create(hostname);
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 20c42298edf..7baabf5edae 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -1578,7 +1578,7 @@ static List _send_recv_rc_msg(slurm_fd fd, slurm_msg_t *req, int timeout)
 	} 
 	
 	ret_data_info = xmalloc(sizeof(ret_data_info_t));
-	ret_data_info->node_name = xstrdup("localhost");
+	ret_data_info->node_name = NULL;
 	ret_data_info->data = NULL;
 	debug3("got reply for %s rc %d %d", 
 	       ret_data_info->node_name, 
@@ -1674,7 +1674,7 @@ failed:
 	}
 	
 	ret_data_info = xmalloc(sizeof(ret_data_info_t));
-	ret_data_info->node_name = xstrdup("localhost");
+	ret_data_info->node_name = NULL;
 	ret_data_info->data = NULL;
 	itr = list_iterator_create(ret_list);		
 	while((ret_type = list_next(itr)) != NULL) {
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index cf10651182a..47b686dc63a 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -252,6 +252,7 @@ typedef struct slurm_msg {
 
 typedef struct ret_data_info {
 	char *node_name;
+	slurm_addr addr;       
 	uint32_t nodeid;
 	void *data;
 } ret_data_info_t;
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index d24e89dbf1a..7e2e82eec37 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -3121,6 +3121,7 @@ _pack_ret_list(List ret_list,
 		itr_data = list_iterator_create(ret_type->ret_data_list);
 		while((ret_data_info = list_next(itr_data)) != NULL) {
 			packstr(ret_data_info->node_name, buffer);
+			slurm_pack_slurm_addr(&ret_data_info->addr, buffer);
 			pack32((uint32_t)ret_data_info->nodeid, buffer);
 			msg.data = ret_data_info->data;
 			pack_msg(&msg, buffer);
@@ -3156,6 +3157,8 @@ _unpack_ret_list(List *ret_list,
 			ret_data_info = xmalloc(sizeof(ret_data_info_t));
 			safe_unpackstr_xmalloc(&ret_data_info->node_name, 
 					       &uint16_tmp, buffer);
+			slurm_unpack_slurm_addr_no_alloc(&ret_data_info->addr,
+							 buffer);
 			safe_unpack32((uint32_t *)&ret_data_info->nodeid, 
 				      buffer);
 			if (unpack_msg(&msg, buffer) != SLURM_SUCCESS)
diff --git a/src/sbcast/agent.c b/src/sbcast/agent.c
index d2a52fde1e0..22d292188c5 100644
--- a/src/sbcast/agent.c
+++ b/src/sbcast/agent.c
@@ -88,11 +88,10 @@ static void *_agent_thread(void *args)
 		while ((ret_data_info = list_next(data_itr)) != NULL) {
 			if (ret_type->msg_rc == SLURM_SUCCESS)
 				continue;
-			if (!strcmp(ret_data_info->node_name,
-					"localhost")) {
-				xfree(ret_data_info->node_name);
+			if (!ret_data_info->node_name) {
 				ret_data_info->node_name = 
 					xstrdup(thread_ptr->node_name);
+				ret_data_info->addr = msg->address;
 			}
 			error("REQUEST_FILE_BCAST(%s): %s",
 				ret_data_info->node_name,
diff --git a/src/sinfo/opts.c b/src/sinfo/opts.c
index d6d0d6848ff..a7002ee7f9b 100644
--- a/src/sinfo/opts.c
+++ b/src/sinfo/opts.c
@@ -73,6 +73,7 @@ extern void parse_command_line(int argc, char *argv[])
 	char *env_val = NULL;
 	int opt_char;
 	int option_index;
+	hostlist_t host_list;
 	static struct option long_options[] = {
 		{"all",       no_argument,       0, 'a'},
 		{"bg",        no_argument,       0, 'b'},
@@ -136,7 +137,8 @@ extern void parse_command_line(int argc, char *argv[])
 		case (int) 'i':
 			params.iterate= atoi(optarg);
 			if (params.iterate <= 0) {
-				error ("Error: --iterate=%s");
+				error ("Error: invalid entry for "
+				       "--iterate=%s", optarg);
 				exit(1);
 			}
 			break;
@@ -145,6 +147,16 @@ extern void parse_command_line(int argc, char *argv[])
 			break;
 		case (int) 'n':
 			params.nodes= xstrdup(optarg);
+			/*
+			 * confirm valid nodelist entry
+			 */
+			host_list = hostlist_create(params.nodes);
+			if (!host_list) {
+				error("'%s' invalid entry for --nodes",
+				      optarg);
+				exit(1);
+			}
+			hostlist_destroy(host_list);
 			break;
 		case (int) 'N':
 			params.node_flag = true;
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 07d7861a9c8..2d47e1ddfde 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -161,6 +161,8 @@ static void _notify_slurmctld_nodes(agent_info_t *agent_ptr,
 		int no_resp_cnt, int retry_cnt);
 static void _purge_agent_args(agent_arg_t *agent_arg_ptr);
 static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count);
+static int _setup_requeue(agent_arg_t *agent_arg_ptr, thd_t *thread_ptr, 
+			  int count, int *spot);
 static void _slurmctld_free_job_launch_msg(batch_job_launch_msg_t * msg);
 static void _spawn_retry_agent(agent_arg_t * agent_arg_ptr);
 static void *_thread_per_group_rpc(void *args);
@@ -455,7 +457,7 @@ static void _update_wdog_state(thd_t *thread_ptr,
 static void *_wdog(void *args)
 {
 	bool srun_agent = false;
-	int i;
+	int i, j, count;
 	agent_info_t *agent_ptr = (agent_info_t *) args;
 	thd_t *thread_ptr = agent_ptr->thread_struct;
 	unsigned long usec = 1250000;
@@ -484,6 +486,7 @@ static void *_wdog(void *args)
 
 		slurm_mutex_lock(&agent_ptr->thread_mutex);
 		for (i = 0; i < agent_ptr->thread_count; i++) {
+			//info("thread name %s",thread_ptr[i].node_name);
 			if(!thread_ptr[i].ret_list) {
 				_update_wdog_state(&thread_ptr[i],
 						   &thread_ptr[i].state,
@@ -492,10 +495,14 @@ static void *_wdog(void *args)
 				itr = list_iterator_create(
 					thread_ptr[i].ret_list);
 				while((ret_type = list_next(itr)) != NULL) {
-					_update_wdog_state(
-						&thread_ptr[i],
-						(state_t *)&ret_type->msg_rc,
-						&thd_comp);
+					count = list_count(ret_type->
+							   ret_data_list);
+					for(j=0; j<count; j++) {
+						_update_wdog_state(
+							&thread_ptr[i],
+							&ret_type->msg_rc,
+							&thd_comp);
+					}
 				}
 				list_iterator_destroy(itr);
 			}
@@ -518,7 +525,7 @@ static void *_wdog(void *args)
 		if (thread_ptr[i].ret_list)
 			list_destroy(thread_ptr[i].ret_list);
 	}
-
+	
 	if (thd_comp.max_delay)
 		debug2("agent maximum delay %d seconds", thd_comp.max_delay);
 	
@@ -817,7 +824,7 @@ static void *_thread_per_group_rpc(void *args)
 	msg.srun_node_id = 0;
 	msg.forward_struct_init = 0;
 
-	//info("forwarding to %d",msg.forward.cnt);
+	//info("%s forwarding to %d",thread_ptr->node_name, msg.forward.cnt);
 	thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT;
 	if (task_ptr->get_reply) {
 	send_rc_again:
@@ -844,6 +851,9 @@ static void *_thread_per_group_rpc(void *args)
 				strncpy(thread_ptr->node_name,
 					fwd_msg.node_name,
 					MAX_SLURM_NAME);
+				memcpy(&thread_ptr->slurm_addr, 
+				       &fwd_msg.addr, 
+				       sizeof(slurm_addr));	
 				goto send_rc_again;
 			}
 		}
@@ -854,7 +864,8 @@ static void *_thread_per_group_rpc(void *args)
 				tmp_ret_list = list_create(destroy_ret_types);
 			
 			fwd_msg.header.srun_node_id = msg.srun_node_id;
-			fwd_msg.header.forward = msg.forward;
+			forward_init(&fwd_msg.header.forward, &msg.forward);
+			//fwd_msg.header.forward = msg.forward;
 			fwd_msg.ret_list = tmp_ret_list;
 			strncpy(fwd_msg.node_name,
 				thread_ptr->node_name,
@@ -862,11 +873,15 @@ static void *_thread_per_group_rpc(void *args)
 			fwd_msg.forward_mutex = NULL;
 			if(forward_msg_to_next(&fwd_msg, errno)) {
 				msg.address = fwd_msg.addr;
-				msg.forward = fwd_msg.header.forward;
+				forward_init(&msg.forward, 
+					     &fwd_msg.header.forward);
 				msg.srun_node_id = fwd_msg.header.srun_node_id;
 				strncpy(thread_ptr->node_name,
 					fwd_msg.node_name,
 					MAX_SLURM_NAME);
+				memcpy(&thread_ptr->slurm_addr, 
+				       &fwd_msg.addr, 
+				       sizeof(slurm_addr));
 				goto send_node_again;
 			}
 		} 
@@ -894,15 +909,17 @@ static void *_thread_per_group_rpc(void *args)
 		data_itr = list_iterator_create(ret_type->ret_data_list);
 		while((ret_data_info = list_next(data_itr)) != NULL) {
 			rc = ret_type->msg_rc;
-			if(!found 
-			   && !strcmp(ret_data_info->node_name,"localhost")) {
-			  //info("got localhost");
-				xfree(ret_data_info->node_name);
+			if(!found && !ret_data_info->node_name) {
 				ret_data_info->node_name = 
 					xstrdup(thread_ptr->node_name);
+				memcpy(&ret_data_info->addr, 
+				       &thread_ptr->slurm_addr, 
+				       sizeof(slurm_addr));
+				/* info("got localhost changing to %s", */
+/* 				     ret_data_info->node_name); */
 				found = 1;
 			}
-/* 			info("response for %s rc = %d", */
+			/* info("response for %s rc = %d", */
 /* 			     ret_data_info->node_name, */
 /* 			     ret_type->msg_rc); */
 			if(rc == SLURM_ERROR) {
@@ -1006,7 +1023,7 @@ static void *_thread_per_group_rpc(void *args)
 						ret_data_info->node_name);
 				}
 			list_iterator_destroy(data_itr);
-			if (srun_agent)
+			if(srun_agent)
 				thread_state = DSH_FAILED;
 			else if(ret_type->type == REQUEST_PING)
 				/* check if a forward failed */
@@ -1024,6 +1041,8 @@ static void *_thread_per_group_rpc(void *args)
 
 cleanup:
 	xfree(args);
+	
+	/* handled at end of thread just in case resend is needed */
 	destroy_forward(&msg.forward);
 	slurm_mutex_lock(thread_mutex_ptr);
 	thread_ptr->ret_list = ret_list;
@@ -1046,6 +1065,61 @@ static void _alarm_handler(int dummy)
 	xsignal(SIGALRM, _alarm_handler);
 }
 
+/*
+ * _setup_requeue - copy the address and name of every node listed under a
+ *	DSH_NO_RESP return code in a thread's ret_list into a retry request
+ * IN/OUT agent_arg_ptr - retry request whose slurm_addr and node_names
+ *	arrays are filled in; if NULL the nodes are only logged
+ * IN thread_ptr - thread record whose ret_list is scanned
+ * IN count - value of *spot at which the retry request is complete
+ * IN/OUT spot - index of the next slot to fill in agent_arg_ptr, advanced
+ *	by one for each node copied
+ * RET 1 as soon as *spot reaches count, otherwise 0
+ */
+static int _setup_requeue(agent_arg_t *agent_arg_ptr, thd_t *thread_ptr,
+			  int count, int *spot)
+{
+	ListIterator itr;
+	ListIterator data_itr;
+	ret_data_info_t *ret_data_info = NULL;
+	ret_types_t *ret_type = NULL;
+	int full = 0;
+
+	itr = list_iterator_create(thread_ptr->ret_list);
+	while (!full && (ret_type = list_next(itr)) != NULL) {
+		debug2("got return type of %d", ret_type->msg_rc);
+		if (ret_type->msg_rc != DSH_NO_RESP)
+			continue;
+
+		data_itr = list_iterator_create(ret_type->ret_data_list);
+		while ((ret_data_info = list_next(data_itr)) != NULL) {
+			debug("got the name %s to resend out of %d",
+			      ret_data_info->node_name, count);
+
+			if (!agent_arg_ptr)
+				continue;
+
+			memcpy(&agent_arg_ptr->slurm_addr[*spot],
+			       &ret_data_info->addr,
+			       sizeof(slurm_addr));
+			strncpy(&agent_arg_ptr->
+				node_names[(*spot) * MAX_SLURM_NAME],
+				ret_data_info->node_name,
+				MAX_SLURM_NAME);
+
+			if ((++(*spot)) == count) {
+				full = 1;
+				break;
+			}
+		}
+		/* always release the iterator, including when the
+		 * request filled up part-way through the list */
+		list_iterator_destroy(data_itr);
+	}
+	list_iterator_destroy(itr);
+	return full;
+}
+
 /*
  * _queue_agent_retry - Queue any failed RPCs for later replay
  * IN agent_info_ptr - pointer to info on completed agent requests
@@ -1057,7 +1115,7 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count)
 	queued_request_t *queued_req_ptr = NULL;
 	thd_t *thread_ptr = agent_info_ptr->thread_struct;
 	int i, j;
-
+	
 	if (count == 0)
 		return;
 
@@ -1074,13 +1132,23 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count)
 
 	j = 0;
 	for (i = 0; i < agent_info_ptr->thread_count; i++) {
-		if (thread_ptr[i].state != DSH_NO_RESP)
-			continue;
-		agent_arg_ptr->slurm_addr[j] = thread_ptr[i].slurm_addr;
-		strncpy(&agent_arg_ptr->node_names[j * MAX_SLURM_NAME],
-			thread_ptr[i].node_name, MAX_SLURM_NAME);
-		if ((++j) == count)
-			break;
+		if(!thread_ptr[i].ret_list) {
+			if (thread_ptr[i].state != DSH_NO_RESP)
+				continue;
+			debug("got the name %s to resend", 
+			      thread_ptr[i].node_name);
+			
+			agent_arg_ptr->slurm_addr[j] = 
+				thread_ptr[i].slurm_addr;
+			strncpy(&agent_arg_ptr->node_names[j * MAX_SLURM_NAME],
+				thread_ptr[i].node_name, MAX_SLURM_NAME);
+			if ((++j) == count)
+				break;
+		} else {
+			if(_setup_requeue(agent_arg_ptr, &thread_ptr[i], 
+					  count, &j))
+				break;
+		}
 	}
 	if (count != j) {
 		error("agent: Retry count (%d) != actual count (%d)", 
-- 
GitLab