From d2911e0a64d068830c8cf94d51d1057c34fb8c1e Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 24 Jul 2006 23:22:09 +0000
Subject: [PATCH] svn merge -r8670:8680
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

---
 NEWS                            |  2 +-
 src/slurmctld/agent.c           | 35 +++++++++++++++++++++++++++++++--
 src/srun/pmi.c                  | 18 +++++++++++++----
 testsuite/expect/test7.2.prog.c |  8 ++++----
 4 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/NEWS b/NEWS
index 08e42e6e23a..e41c089bebc 100644
--- a/NEWS
+++ b/NEWS
@@ -1783,7 +1783,7 @@ documents those changes that are of interest to users and admins.
  -- "fatal: _shm_unlock: Numerical result out of range" bug fixed in slurmd.
  -- Config file parsing is now case insensitive.
  -- SLURM_NODELIST environment variable now set in allocate mode.
-  
+ 
 * Changes in SLURM 0.2.0-pre2
 =============================
 
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 78c36d2eadd..ef25a028b57 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -1208,15 +1208,30 @@ extern int agent_retry (int min_wait)
 	int list_size = 0;
 	time_t now = time(NULL);
 	queued_request_t *queued_req_ptr = NULL;
+	agent_arg_t *agent_arg_ptr;
+	ListIterator retry_iter;
 
 	if (retry_list) {
 		static time_t last_msg_time = (time_t) 0;
+		uint32_t msg_type[5] = {0}, i = 0;	/* unused slots log as 0 */
 		list_size = list_count(retry_list);
 		if ((list_size > MAX_AGENT_CNT) 
 		&&  (difftime(now, last_msg_time) > 300)) {
 			/* Note sizable backlog of work */
 			info("WARNING: agent retry_list size is %d", 
 				list_size);
+			retry_iter = list_iterator_create(retry_list);
+			while ((queued_req_ptr = (queued_request_t *) 
+					list_next(retry_iter))) {
+				agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
+				msg_type[i++] = agent_arg_ptr->msg_type;
+				if (i == 5)
+					break;
+			}
+			list_iterator_destroy(retry_iter);
+			info("   retry_list msg_type=%u,%u,%u,%u,%u", 
+				msg_type[0], msg_type[1], msg_type[2],
+				msg_type[3], msg_type[4]);
 			last_msg_time = now;
 		}
 	}
@@ -1225,10 +1240,26 @@ extern int agent_retry (int min_wait)
 
 	slurm_mutex_lock(&retry_mutex);
 	if (retry_list) {
-		ListIterator retry_iter;
+		/* first try to find a new (never tried) record */
+		retry_iter = list_iterator_create(retry_list);
+		while ((queued_req_ptr = (queued_request_t *)
+				list_next(retry_iter))) {
+			if (queued_req_ptr->last_attempt == 0) {
+				list_remove(retry_iter);
+				list_size--;
+				break;
+			}
+		}	/* queued_req_ptr is NULL if no new record was found */
+		list_iterator_destroy(retry_iter);
+	}
+
+	if (retry_list && (queued_req_ptr == NULL)) {
+		/* now try to find a requeue request that is 
+		 * relatively old */
 		double age = 0;
 
 		retry_iter = list_iterator_create(retry_list);
+		/* next try to find an older record to retry */
 		while ((queued_req_ptr = (queued_request_t *) 
 				list_next(retry_iter))) {
 			age = difftime(now, queued_req_ptr->last_attempt);
@@ -1243,7 +1274,7 @@ extern int agent_retry (int min_wait)
 	slurm_mutex_unlock(&retry_mutex);
 
 	if (queued_req_ptr) {
-		agent_arg_t *agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
+		agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
 		xfree(queued_req_ptr);
 		if (agent_arg_ptr)
 			_spawn_retry_agent(agent_arg_ptr);
diff --git a/src/srun/pmi.c b/src/srun/pmi.c
index 289275d3cf4..bf40d48b71b 100644
--- a/src/srun/pmi.c
+++ b/src/srun/pmi.c
@@ -47,6 +47,7 @@
 /* Global variables */
 pthread_mutex_t kvs_mutex = PTHREAD_MUTEX_INITIALIZER;
 int kvs_comm_cnt = 0;
+int kvs_updated = 0;
 struct kvs_comm **kvs_comm_ptr = NULL;
 
 struct barrier_resp {
@@ -95,15 +96,23 @@ static void _kvs_xmit_tasks(void)
 #if _DEBUG
 	info("All tasks at barrier, transmit KVS keypairs now");
 #endif
-	/* copy the data */
+	/* reset barrier info */
 	args = xmalloc(sizeof(struct agent_arg));
 	args->barrier_xmit_ptr = barrier_ptr;
 	args->barrier_xmit_cnt = barrier_cnt;
 	barrier_ptr = NULL;
 	barrier_resp_cnt = 0;
 	barrier_cnt = 0;
-	args->kvs_xmit_ptr = _kvs_comm_dup();
-	args->kvs_xmit_cnt = kvs_comm_cnt;
+
+	/* copy the new kvs data */
+	if (kvs_updated) {
+		args->kvs_xmit_ptr = _kvs_comm_dup();
+		args->kvs_xmit_cnt = kvs_comm_cnt;
+		kvs_updated = 0;
+	} else {	/* No new data to transmit */
+		args->kvs_xmit_ptr = xmalloc(0);
+		args->kvs_xmit_cnt = 0;
+	}
 
 	/* Spawn a pthread to transmit it */
 	slurm_attr_init(&attr);
@@ -316,12 +325,13 @@ extern int pmi_kvs_put(struct kvs_comm_set *kvs_set_ptr)
 			_merge_named_kvs(kvs_ptr, 
 				kvs_set_ptr->kvs_comm_ptr[i]);
 		} else {
-			_move_kvs(kvs_set_ptr-> kvs_comm_ptr[i]);
+			_move_kvs(kvs_set_ptr->kvs_comm_ptr[i]);
 			kvs_set_ptr-> kvs_comm_ptr[i] = NULL;
 		}
 	}
 	slurm_free_kvs_comm_set(kvs_set_ptr);
 	_print_kvs();
+	kvs_updated = 1;
 	pthread_mutex_unlock(&kvs_mutex);
 	return SLURM_SUCCESS;
 }
diff --git a/testsuite/expect/test7.2.prog.c b/testsuite/expect/test7.2.prog.c
index ad46a923498..a73c636b3fa 100644
--- a/testsuite/expect/test7.2.prog.c
+++ b/testsuite/expect/test7.2.prog.c
@@ -34,11 +34,11 @@
 #else
 /* Typical MVAPICH2 use
  *
- * Typically takes very long time for large task count
- * adjust job time limit and timeout in test7.2 as needed
+ * Adjust job time limit and timeout in test7.2 as needed
+ * for large values.
  */
-#  define BARRIER_CNT           7
-#  define PUTS_PER_BARRIER     32
+#  define BARRIER_CNT           4
+#  define PUTS_PER_BARRIER      0
 #endif
 
 #define OFFSET_1  1234
-- 
GitLab