From 5168cba29e63d730a4cad313e20e05c2f9116b04 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 11 Jan 2007 22:19:34 +0000
Subject: [PATCH] svn merge -r10737:10749
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

---
 NEWS                                  |  3 +++
 doc/man/man1/sbcast.1                 |  7 ++++---
 src/api/pmi_server.c                  |  2 +-
 src/api/slurm_pmi.c                   | 28 ++++++++++++++++++++-------
 src/common/slurm_protocol_api.c       |  2 +-
 src/plugins/switch/elan/switch_elan.c |  2 +-
 src/srun/opt.h                        |  2 +-
 testsuite/expect/test7.2              |  7 +++++--
 8 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/NEWS b/NEWS
index 6ae221f4935..7e37f1e8a04 100644
--- a/NEWS
+++ b/NEWS
@@ -196,6 +196,9 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 1.1.25
 =========================
+ - switch/elan: Fix for "Failed to initialise stats structure" from
+   libelan when ELAN_STATKEY > INT_MAX.
+ - Tune PMI support logic for better scalability and performance.
 
 * Changes in SLURM 1.1.24
 =========================
diff --git a/doc/man/man1/sbcast.1 b/doc/man/man1/sbcast.1
index f2f3cf5d892..64bb43c47e0 100644
--- a/doc/man/man1/sbcast.1
+++ b/doc/man/man1/sbcast.1
@@ -10,14 +10,15 @@ sbcast \- transmit a file to the nodes allocated to a SLURM job.
 \fBsbcast\fR is used to transmit a file to all nodes allocated 
 to the currenly active SLURM job.
 This command should only be executed from within a SLURM batch
-job or within the shell spawned after a SLURM job\'s resource 
+job or within the shell spawned after a SLURM job's resource 
 allocation. 
 \fBSOURCE\fR is the name of a file on the current node.
 \fBDEST\fR should be the fully qualified pathname for the 
 file copy to be created on each node. 
 \fBDEST\fR should be on a file system local to that node.
-Note that parallel file systems may provide better performance 
-than \fBsbcast\fR can provide.
+Note that parallel file systems \fImay\fR provide better performance 
+than \fBsbcast\fR can provide, although performance will vary 
+by file size, degree of parallelism, and network type.
 
 .SH "OPTIONS"
 .TP
diff --git a/src/api/pmi_server.c b/src/api/pmi_server.c
index 5f35bab3b12..ce603a11c04 100644
--- a/src/api/pmi_server.c
+++ b/src/api/pmi_server.c
@@ -139,7 +139,7 @@ static void *_msg_thread(void *x)
 		msg_arg_ptr->bar_ptr->port,
 		msg_arg_ptr->bar_ptr->hostname);
 
-	timeout = slurm_get_msg_timeout() * 8000;
+	timeout = slurm_get_msg_timeout() * 10000;
 	if (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
 		error("slurm_send_recv_rc_msg_only_one: %m");
 	} else if (rc != SLURM_SUCCESS) {
diff --git a/src/api/slurm_pmi.c b/src/api/slurm_pmi.c
index 83125e15bc2..879cf418bc1 100644
--- a/src/api/slurm_pmi.c
+++ b/src/api/slurm_pmi.c
@@ -49,7 +49,7 @@
 #include "src/common/slurm_auth.h"
 
 #define MAX_RETRIES 5
-#define PMI_TIME    1000	/* spacing between RPCs, usec */
+#define PMI_TIME    500	/* spacing between RPCs, usec */
 
 int pmi_fd = -1;
 uint16_t srun_port = 0;
@@ -95,10 +95,17 @@ int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr,
 	 * the same time and refuse some connections, retry as 
 	 * needed. Spread out messages by task's rank. Also 
 	 * increase the timeout if many tasks since the srun 
-	 * command is very overloaded. */
+	 * command is very overloaded.
+	 * The timeout is scaled up with the task count
+	 * (default timeout is 10 secs). */
 	usleep(pmi_rank * PMI_TIME);
-	if (pmi_size > 10)
-		timeout = slurm_get_msg_timeout() * 8000;
+	if      (pmi_size > 1000)	/* 100 secs */
+		timeout = slurm_get_msg_timeout() * 10000;
+	else if (pmi_size > 100)	/* 50 secs */
+		timeout = slurm_get_msg_timeout() * 5000;
+	else if (pmi_size > 10)		/* 20 secs */
+		timeout = slurm_get_msg_timeout() * 2000;
+
 	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
 		if (retries++ > MAX_RETRIES) {
 			error("slurm_send_kvs_comm_set: %m");
@@ -164,10 +171,17 @@ int  slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr,
 	 * the same time and refuse some connections, retry as 
 	 * needed. Spread out messages by task's rank. Also
 	 * increase the timeout if many tasks since the srun
-	 * command is very overloaded. */
+	 * command is very overloaded.
+	 * The timeout is scaled up with the task count
+	 * (default timeout is 10 secs). */
 	usleep(pmi_rank * PMI_TIME);
-	if (pmi_size > 10)
-		timeout = slurm_get_msg_timeout() * 8000;
+	if      (pmi_size > 1000)	/* 100 secs */
+		timeout = slurm_get_msg_timeout() * 10000;
+	else if (pmi_size > 100)	/* 50 secs */
+		timeout = slurm_get_msg_timeout() * 5000;
+	else if (pmi_size > 10)		/* 20 secs */
+		timeout = slurm_get_msg_timeout() * 2000;
+
 	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
 		if (retries++ > MAX_RETRIES) {
 			error("slurm_get_kvs_comm_set: %m");
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 870182838e1..2421fd29fb4 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -802,7 +802,7 @@ int slurm_receive_msg(slurm_fd fd, slurm_msg_t *msg, int timeout)
 		/* convert secs to msec */
                 timeout  = slurm_get_msg_timeout() * 1000; 
 
-	if(timeout >= (slurm_get_msg_timeout() * 10000)) {
+	if(timeout > (slurm_get_msg_timeout() * 10000)) {
 		error("slurm_receive_msg: "
 		      "You are sending a message with timeout's greater "
 		      "than %d seconds, your's is %d seconds", 
diff --git a/src/plugins/switch/elan/switch_elan.c b/src/plugins/switch/elan/switch_elan.c
index e376292552e..eac9dd21d03 100644
--- a/src/plugins/switch/elan/switch_elan.c
+++ b/src/plugins/switch/elan/switch_elan.c
@@ -694,7 +694,7 @@ int switch_p_job_attach ( switch_jobinfo_t jobinfo, char ***env,
 	 * Tell libelan the key to use for Elan state shmem segment
 	 */
 	if (qsw_statkey ((qsw_jobinfo_t) jobinfo, &id) >= 0)
-		slurm_setenvpf (env, "ELAN_STATKEY", "0x%x", id);
+		slurm_setenvpf (env, "ELAN_STATKEY", "%d", id);
 
 	return SLURM_SUCCESS;
 }
diff --git a/src/srun/opt.h b/src/srun/opt.h
index b9bbb5ff8b6..fbca4fa044e 100644
--- a/src/srun/opt.h
+++ b/src/srun/opt.h
@@ -52,7 +52,7 @@
 #include "src/common/env.h"
 #include "src/srun/fname.h"
 
-#define MAX_THREADS	32
+#define MAX_THREADS	60
 #define MAX_USERNAME	9
 
 #define INT_UNASSIGNED ((int)-1)
diff --git a/testsuite/expect/test7.2 b/testsuite/expect/test7.2
index be274789df1..270dc834873 100755
--- a/testsuite/expect/test7.2
+++ b/testsuite/expect/test7.2
@@ -43,7 +43,7 @@ print_header $test_id
 #
 exec $bin_rm -f $file_prog_get
 if {![test_aix]} {
-	exec $bin_cc ${file_prog_get}.c -g -pthread -o $file_prog_get -I${slurm_dir}/include -Wl,--rpath=${slurm_dir}/lib -L${slurm_dir}/lib -lpmi -lslurm
+	exec $bin_cc ${file_prog_get}.c -g -pthread -o $file_prog_get -I${slurm_dir}/include -L/usr/lib64 -Wl,--rpath=/usr/lib64 -L${slurm_dir}/lib -Wl,--rpath=${slurm_dir}/lib -lpmi -lslurm
 } else {
 	exec $bin_cc ${file_prog_get}.c -Wl,-brtl -g -pthread -o $file_prog_get -I${slurm_dir}/include -L${slurm_dir}/lib -lpmi -lslurm
 }
@@ -55,11 +55,14 @@ exec $bin_chmod 700 $file_prog_get
 set timeout $max_job_delay
 if { [test_bluegene] } {
 	set node_cnt 1-1024
+	set task_cnt 8
 } else {
 	if { [test_xcpu] } {
 		set node_cnt 1-1
+		set task_cnt 8
 	} else {
 		set node_cnt 1-4
+		set task_cnt 8
 	}
 }
 
@@ -67,7 +70,7 @@ if { [test_bluegene] } {
 # Adjust time limits as needed for large task counts */
 #                   times are here  vv 
 set timeout [expr $max_job_delay +  60]
-set srun_pid [spawn $srun -l -N$node_cnt -n8 -O -t1 --threads=1 $file_prog_get]
+set srun_pid [spawn $srun -l -N$node_cnt -n$task_cnt -O -t1 --threads=1 $file_prog_get]
 expect {
 	-re "FAILURE" {
 		send_user "\nFAILURE: some error occured\n"
-- 
GitLab