From 5168cba29e63d730a4cad313e20e05c2f9116b04 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 11 Jan 2007 22:19:34 +0000
Subject: [PATCH] svn merge -r10737:10749
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

---
 NEWS                                  |  3 +++
 doc/man/man1/sbcast.1                 |  7 ++++---
 src/api/pmi_server.c                  |  2 +-
 src/api/slurm_pmi.c                   | 28 ++++++++++++++++++++-------
 src/common/slurm_protocol_api.c       |  2 +-
 src/plugins/switch/elan/switch_elan.c |  2 +-
 src/srun/opt.h                        |  2 +-
 testsuite/expect/test7.2              |  7 +++++--
 8 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/NEWS b/NEWS
index 6ae221f4935..7e37f1e8a04 100644
--- a/NEWS
+++ b/NEWS
@@ -196,6 +196,9 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 1.1.25
 =========================
+ - switch/elan: Fix for "Failed to initialise stats structure" from
+   libelan when ELAN_STATKEY > INT_MAX.
+ - Tune PMI support logic for better scalability and performance.
 
 * Changes in SLURM 1.1.24
 =========================
diff --git a/doc/man/man1/sbcast.1 b/doc/man/man1/sbcast.1
index f2f3cf5d892..64bb43c47e0 100644
--- a/doc/man/man1/sbcast.1
+++ b/doc/man/man1/sbcast.1
@@ -10,14 +10,15 @@ sbcast \- transmit a file to the nodes allocated to a SLURM job.
 \fBsbcast\fR is used to transmit a file to all nodes allocated
 to the currently active SLURM job. This command should only be
 executed from within a SLURM batch
-job or within the shell spawned after a SLURM job\'s resource
+job or within the shell spawned after a SLURM job's resource
 allocation.
 \fBSOURCE\fR is the name of a file on the current node.
 \fBDEST\fR should be the fully qualified pathname for the
 file copy to be created on each node.
 \fBDEST\fR should be on a file system local to that node.
-Note that parallel file systems may provide better performance
-than \fBsbcast\fR can provide.
+Note that parallel file systems \fImay\fR provide better performance
+than \fBsbcast\fR can provide, although performance will vary
+by file size, degree of parallelism, and network type.
 
 .SH "OPTIONS"
 .TP
diff --git a/src/api/pmi_server.c b/src/api/pmi_server.c
index 5f35bab3b12..ce603a11c04 100644
--- a/src/api/pmi_server.c
+++ b/src/api/pmi_server.c
@@ -139,7 +139,7 @@ static void *_msg_thread(void *x)
 			msg_arg_ptr->bar_ptr->port,
 			msg_arg_ptr->bar_ptr->hostname);
 
-	timeout = slurm_get_msg_timeout() * 8000;
+	timeout = slurm_get_msg_timeout() * 10000;
 	if (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
 		error("slurm_send_recv_rc_msg_only_one: %m");
 	} else if (rc != SLURM_SUCCESS) {
diff --git a/src/api/slurm_pmi.c b/src/api/slurm_pmi.c
index 83125e15bc2..879cf418bc1 100644
--- a/src/api/slurm_pmi.c
+++ b/src/api/slurm_pmi.c
@@ -49,7 +49,7 @@
 #include "src/common/slurm_auth.h"
 
 #define MAX_RETRIES 5
-#define PMI_TIME 1000	/* spacing between RPCs, usec */
+#define PMI_TIME 500	/* spacing between RPCs, usec */
 
 int pmi_fd = -1;
 uint16_t srun_port = 0;
@@ -95,10 +95,17 @@ int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr,
 	 * the same time and refuse some connections, retry as
 	 * needed. Spread out messages by task's rank. Also
 	 * increase the timeout if many tasks since the srun
-	 * command is very overloaded. */
+	 * command is very overloaded. We scale the timeout
+	 * with the task count (the default timeout is
+	 * 10 secs). */
 	usleep(pmi_rank * PMI_TIME);
-	if (pmi_size > 10)
-		timeout = slurm_get_msg_timeout() * 8000;
+	if (pmi_size > 1000)		/* 100 secs */
+		timeout = slurm_get_msg_timeout() * 10000;
+	else if (pmi_size > 100)	/* 50 secs */
+		timeout = slurm_get_msg_timeout() * 5000;
+	else if (pmi_size > 10)		/* 20 secs */
+		timeout = slurm_get_msg_timeout() * 2000;
+
 	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
 		if (retries++ > MAX_RETRIES) {
 			error("slurm_send_kvs_comm_set: %m");
@@ -164,10 +171,17 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr,
 	 * the same time and refuse some connections, retry as
 	 * needed. Spread out messages by task's rank. Also
 	 * increase the timeout if many tasks since the srun
-	 * command is very overloaded. */
+	 * command is very overloaded. We scale the timeout
+	 * with the task count (the default timeout is
+	 * 10 secs). */
 	usleep(pmi_rank * PMI_TIME);
-	if (pmi_size > 10)
-		timeout = slurm_get_msg_timeout() * 8000;
+	if (pmi_size > 1000)		/* 100 secs */
+		timeout = slurm_get_msg_timeout() * 10000;
+	else if (pmi_size > 100)	/* 50 secs */
+		timeout = slurm_get_msg_timeout() * 5000;
+	else if (pmi_size > 10)		/* 20 secs */
+		timeout = slurm_get_msg_timeout() * 2000;
+
 	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
 		if (retries++ > MAX_RETRIES) {
 			error("slurm_get_kvs_comm_set: %m");
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 870182838e1..2421fd29fb4 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -802,7 +802,7 @@ int slurm_receive_msg(slurm_fd fd, slurm_msg_t *msg, int timeout)
 		/* convert secs to msec */
 		timeout = slurm_get_msg_timeout() * 1000;
 
-	if(timeout >= (slurm_get_msg_timeout() * 10000)) {
+	if(timeout > (slurm_get_msg_timeout() * 10000)) {
 		error("slurm_receive_msg: "
 		      "You are sending a message with timeouts greater "
 		      "than %d seconds, yours is %d seconds",
diff --git a/src/plugins/switch/elan/switch_elan.c b/src/plugins/switch/elan/switch_elan.c
index e376292552e..eac9dd21d03 100644
--- a/src/plugins/switch/elan/switch_elan.c
+++ b/src/plugins/switch/elan/switch_elan.c
@@ -694,7 +694,7 @@ int switch_p_job_attach ( switch_jobinfo_t jobinfo, char ***env,
 	 *  Tell libelan the key to use for Elan state shmem segment
 	 */
 	if (qsw_statkey ((qsw_jobinfo_t) jobinfo, &id) >= 0)
-		slurm_setenvpf (env, "ELAN_STATKEY", "0x%x", id);
+		slurm_setenvpf (env, "ELAN_STATKEY", "%d", id);
 
 	return SLURM_SUCCESS;
 }
diff --git a/src/srun/opt.h b/src/srun/opt.h
index b9bbb5ff8b6..fbca4fa044e 100644
--- a/src/srun/opt.h
+++ b/src/srun/opt.h
@@ -52,7 +52,7 @@
 #include "src/common/env.h"
 #include "src/srun/fname.h"
 
-#define MAX_THREADS 32
+#define MAX_THREADS 60
 #define MAX_USERNAME 9
 
 #define INT_UNASSIGNED ((int)-1)
diff --git a/testsuite/expect/test7.2 b/testsuite/expect/test7.2
index be274789df1..270dc834873 100755
--- a/testsuite/expect/test7.2
+++ b/testsuite/expect/test7.2
@@ -43,7 +43,7 @@ print_header $test_id
 #
 exec $bin_rm -f $file_prog_get
 if {![test_aix]} {
-	exec $bin_cc ${file_prog_get}.c -g -pthread -o $file_prog_get -I${slurm_dir}/include -Wl,--rpath=${slurm_dir}/lib -L${slurm_dir}/lib -lpmi -lslurm
+	exec $bin_cc ${file_prog_get}.c -g -pthread -o $file_prog_get -I${slurm_dir}/include -L/usr/lib64 -Wl,--rpath=/usr/lib64 -L${slurm_dir}/lib -Wl,--rpath=${slurm_dir}/lib -lpmi -lslurm
 } else {
 	exec $bin_cc ${file_prog_get}.c -Wl,-brtl -g -pthread -o $file_prog_get -I${slurm_dir}/include -L${slurm_dir}/lib -lpmi -lslurm
 }
@@ -55,11 +55,14 @@ exec $bin_chmod 700 $file_prog_get
 set timeout $max_job_delay
 if { [test_bluegene] } {
 	set node_cnt 1-1024
+	set task_cnt 8
 } else {
 	if { [test_xcpu] } {
 		set node_cnt 1-1
+		set task_cnt 8
 	} else {
 		set node_cnt 1-4
+		set task_cnt 8
 	}
 }
 
@@ -67,7 +70,7 @@ if { [test_bluegene] } {
 # Adjust time limits as needed for large task counts */
 #                   times are here   vv
 set timeout [expr $max_job_delay + 60]
-set srun_pid [spawn $srun -l -N$node_cnt -n8 -O -t1 --threads=1 $file_prog_get]
+set srun_pid [spawn $srun -l -N$node_cnt -n$task_cnt -O -t1 --threads=1 $file_prog_get]
 expect {
 	-re "FAILURE" {
 		send_user "\nFAILURE: some error occurred\n"
-- 
GitLab
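
Editorial note (not part of the patch): the slurm_pmi.c hunks above replace
the single flat 8000x multiplier with three tiers keyed to the PMI job size,
and the slurm_protocol_api.c hunk relaxes the sanity check from >= to > so
the largest tier (10000x, i.e. 100 secs at the default 10-sec MessageTimeout)
passes exactly at the ceiling. Below is a minimal standalone sketch of that
tier logic so the scaling can be checked in isolation. It is an illustration,
not SLURM code: msg_timeout_secs stands in for slurm_get_msg_timeout(), the
base secs-to-msec conversion mirrors the line visible in the
slurm_protocol_api.c hunk (the initial value in slurm_pmi.c lies outside the
hunk context), and only the thresholds and multipliers are copied verbatim
from the patch.

#include <stdio.h>

/* Stand-in for slurm_get_msg_timeout(); SLURM's default
 * MessageTimeout is 10 secs. */
static int msg_timeout_secs = 10;

/* Tiered RPC timeout (msec), following the slurm_pmi.c hunks:
 * larger PMI jobs get a longer timeout because a single srun
 * process must service RPCs from every task. */
static int pmi_timeout_msec(int pmi_size)
{
	int timeout = msg_timeout_secs * 1000;	/* base: secs to msec */

	if (pmi_size > 1000)		/* 100 secs at the default */
		timeout = msg_timeout_secs * 10000;
	else if (pmi_size > 100)	/* 50 secs at the default */
		timeout = msg_timeout_secs * 5000;
	else if (pmi_size > 10)		/* 20 secs at the default */
		timeout = msg_timeout_secs * 2000;
	return timeout;
}

int main(void)
{
	int sizes[] = { 4, 64, 512, 4096 };
	int i;

	for (i = 0; i < 4; i++)
		printf("pmi_size=%4d -> timeout=%6d msec\n",
		       sizes[i], pmi_timeout_msec(sizes[i]));
	return 0;
}

At the default timeout this prints 10000, 20000, 50000, and 100000 msec for
the four sizes; the top tier lands exactly on the 10000x limit that
slurm_receive_msg() now accepts, which is why the comparison had to change
from >= to >.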