diff --git a/NEWS b/NEWS index 259ce99919bddc46e723d0f6ad66651982155d57..6103472fa808f0c2c09c5a9b3d92c0234cfde06e 100644 --- a/NEWS +++ b/NEWS @@ -82,6 +82,12 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.1.16 ========================= + - BLUEGENE - fix to make prolog run 5 minutes longer to make sure we have + enough time to free the overlapping blocks when starting a new job on a + block. + - BLUEGENE - edit to the libsched_if.so to read env and look at + MPIRUN_PARTITION to see if we are in slurm or running mpirun natively. + - Plugins are now dlopened RTLD_LAZY instead of RTLD_NOW. * Changes in SLURM 1.1.15 ========================= diff --git a/doc/html/slurm.shtml b/doc/html/slurm.shtml index 3426f75f89418996db7c7e588dd7f334e2d67b07..61a422bc219590f7150136c22bb64633983c8f9d 100644 --- a/doc/html/slurm.shtml +++ b/doc/html/slurm.shtml @@ -33,12 +33,13 @@ add functionality.</li> <p>SLURM provides resource management on about 1000 computers world-wide including many of the most powerful computers in the world including: <ul> -<li><a href="http://www.llnl.gov/asci/platforms/bluegenel/">BlueGene/L</a> with 65,536 -dual-processor compute nodes</li> -<li><a href="http://www.llnl.gov/linux/thunder/">Thunder</a> a Linux cluster with 1024 -nodes, each having four Itanium2 processors</li> -<li><a href="http://www.llnl.gov/asci/platforms/purple/">ASC Purple</a> an IBM SP/AIX -cluster with 1500 nodes, each having eight Power5 processors</li> +<li><a href="http://www.llnl.gov/asc/computing_resources/bluegenel/bluegene_home.html">BlueGene/L</a> +with 65,536 dual-processor compute nodes</li> +<li><a href="http://www.llnl.gov/asc/computing_resources/purple/purple_index.html">ASC Purple</a> +an IBM SP/AIX cluster with 1532 nodes each having eight Power5 processors</li> +<li>Peloton +with 1152 nodes each having four sockets with dual core Opteron processors and an +InfiniBand switch</li> </ul> There are about 150 downloads of SLURM 
per month from LLNL's FTP server alone. As of September 2006, SLURM has been downloaded over 3500 times to over 500 @@ -46,6 +47,6 @@ distinct sites in 38 countries. SLURM is also distributed and supported by <a href="http://www.hp.com"> Hewlett-Packard</a> as the resource manager in their XC System Software.</p> -<p style="text-align:center;">Last modified 29 September 2006</p> +<p style="text-align:center;">Last modified 19 October 2006</p> <!--#include virtual="footer.txt"--> diff --git a/src/common/plugin.c b/src/common/plugin.c index 1940986537797b19f3ad7cc6c47bf958c250cf47..dc7c6b02e0ac702f31898d5123345698e3d405b4 100644 --- a/src/common/plugin.c +++ b/src/common/plugin.c @@ -109,14 +109,16 @@ plugin_load_from_file( const char *fq_path ) int (*init)( void ); /* - * Try to open the shared object. We have a choice of trying to - * resolve all the symbols (in both directions) now or when the - * symbols are first dereferenced and used. While it's slower to - * do it this way, it's a lot easier to debug. If you get an - * error somewhere down the line, you're likely to think it's - * some condition that happened then instead of way back here. + * Try to open the shared object. + * + * Use RTLD_LAZY to allow plugins to use symbols that may be + * defined in only one slurm entity (e.g. srun and not slurmd), + * when the use of that symbol is restricted to within the + * entity from which it is available. (i.e. srun symbols are only + * used in the context of srun, not slurmd.) 
+ * */ - plug = dlopen( fq_path, RTLD_NOW ); + plug = dlopen( fq_path, RTLD_LAZY ); if ( plug == NULL ) { error( "plugin_load_from_file: dlopen(%s): %s", fq_path, diff --git a/src/plugins/sched/wiki2/msg.c b/src/plugins/sched/wiki2/msg.c index 1ea7d91aeec82d7dbd76f937b7e145b2caa9879f..40916cffa5351fa3799cbc4a0a6099a093022811 100644 --- a/src/plugins/sched/wiki2/msg.c +++ b/src/plugins/sched/wiki2/msg.c @@ -389,7 +389,8 @@ static int _parse_msg(char *msg, char **req) if (delta_t > 300) { err_code = -350; err_msg = "TS value too far from NOW"; - error("wiki: TS delta_t=%u", delta_t); + error("wiki: TimeStamp too far from NOW (%u secs)", + delta_t); return -1; } diff --git a/src/plugins/select/bluegene/plugin/libsched_if64.c b/src/plugins/select/bluegene/plugin/libsched_if64.c index b64740eb288194f33b4baf77ba4c2a9c26ea71da..37c24e5399d03ee3c9b3992a585148d9c0133205 100644 --- a/src/plugins/select/bluegene/plugin/libsched_if64.c +++ b/src/plugins/select/bluegene/plugin/libsched_if64.c @@ -37,11 +37,20 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ #include <stdio.h> +#include <stdlib.h> +#include <string.h> int get_parameters(void *params) { - printf("YOU ARE OUTSIDE OF SLURM!!!! NOT RUNNING MPIRUN!\n"); - return -1; + char *partition = getenv("MPIRUN_PARTITION"); /* get MPIRUN env + * var to see if we + * are inside slurm + * or not */ + if (!partition || (strlen(partition) < 3)) { + printf("YOU ARE OUTSIDE OF SLURM!!!! NOT RUNNING MPIRUN!\n"); + return 1; + } + return 2; } void mpirun_done(int res) diff --git a/src/plugins/select/bluegene/plugin/slurm_epilog.c b/src/plugins/select/bluegene/plugin/slurm_epilog.c index 3e9b168ddb9f8822ed46bff6ab23b80c60077aec..b5f5bef4b3b2656f75e11e127abdeb317b0c5be8 100644 --- a/src/plugins/select/bluegene/plugin/slurm_epilog.c +++ b/src/plugins/select/bluegene/plugin/slurm_epilog.c @@ -19,7 +19,7 @@ * any later version. 
* * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than diff --git a/src/plugins/select/bluegene/plugin/slurm_prolog.c b/src/plugins/select/bluegene/plugin/slurm_prolog.c index 95c932f236c913b97ddb15557e10ab5ddf37c0b9..25ef0fa4e6ff4ae0c311974b54ffecebd48da841 100644 --- a/src/plugins/select/bluegene/plugin/slurm_prolog.c +++ b/src/plugins/select/bluegene/plugin/slurm_prolog.c @@ -19,7 +19,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -61,17 +61,18 @@ /* * Check the bgblock's status every POLL_SLEEP seconds. - * Retry for a period of MIN_DELAY + + * Retry for a period of MIN_FREE_PREVIOUS_BLOCK_DELAY + MIN_DELAY + * (INCR_DELAY * POLL_SLEEP * base partition count). * For example if MIN_DELAY=300 and INCR_DELAY=20 and POLL_SLEEP=3, * wait up to 1260 seconds. * For a 16 base partition bgblock to be ready (300 + (20 * 3 * 16). 
*/ #define POLL_SLEEP 3 /* retry interval in seconds */ +#define MIN_FREE_PREVIOUS_BLOCK_DELAY 300 /* time in seconds */ #define MIN_DELAY 300 /* time in seconds */ #define INCR_DELAY 20 /* time in seconds per BP */ -int max_delay = MIN_DELAY; +int max_delay = MIN_DELAY + MIN_FREE_PREVIOUS_BLOCK_DELAY; int cur_delay = 0; enum rm_partition_state {RM_PARTITION_FREE, @@ -114,7 +115,8 @@ static int _wait_part_ready(uint32_t job_id) { int is_ready = 0, i, rc; - max_delay = MIN_DELAY + (INCR_DELAY * _get_job_size(job_id)); + max_delay = MIN_DELAY + MIN_FREE_PREVIOUS_BLOCK_DELAY + + (INCR_DELAY * _get_job_size(job_id)); #if _DEBUG printf("Waiting for job %u to become ready.", job_id);