diff --git a/NEWS b/NEWS
index c962a6b36006da4926668be4f652e4fb141a1f4c..6acb1c7ee9a70fea327e29deeaa74e2e4993b82e 100644
--- a/NEWS
+++ b/NEWS
@@ -5,16 +5,21 @@ documents those changes that are of interest to users and admins.
 =============================
  -- OpenMPI users only: Add srun logic to automatically recreate and re-launch
     a job step if the step fails with a reserved port conflict.
- -- Add TopologyPlugin configuration parameter.
- -- Add switch topology data structure to slurmctld (for use by select plugin)
-    add load it based upon new slurm.conf parameters: SwitchName, Nodes,
-    Switches and LinkSpeed.
+ -- Added TopologyPlugin configuration parameter.
+ -- Added switch topology data structure to slurmctld (for use by select
+    plugin) and load it based upon new slurm.conf parameters: SwitchName,
+    Nodes, Switches and LinkSpeed.
  -- Modify select/linear and select/cons_res plugins to optimize resource
     allocation with respect to network topology.
- -- Add support for new configuration parameter EpilogSlurmctld (executed by
-    slurmctld daemon).
- -- Add checkpoint/blcr plugin, SLURM now support job checkpoint/restart using
-    BLCR. Patch from Hongjia Cao, NUDT, China.
+ -- Added support for new configuration parameter EpilogSlurmctld (executed
+    by slurmctld daemon).
+ -- Added checkpoint/blcr plugin; SLURM now supports job checkpoint/restart
+    using BLCR. Patch from Hongjia Cao, NUDT, China.
+ -- Made a variety of new environment variables available to PrologSlurmctld
+    and EpilogSlurmctld. See the "Prolog and Epilog Scripts" section of the
+    slurm.conf man page for details.
+ -- NOTE: Cold-start (without preserving state) required for upgrade from
+    version 1.4.0-pre8.
 
 * Changes in SLURM 1.4.0-pre8
 =============================
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 6e2b42d2ecf84c885f9f322f71383dc3b0de3d38..171a9609a002e0fb5723b729d07387b9cd435e48 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,5 +1,5 @@
 RELEASE NOTES FOR SLURM VERSION 2.0
-10 February 2009 (after SLURM 1.4.0-pre7 released)
+11 February 2009 (after SLURM 1.4.0-pre8 released)
 
 IMPORTANT NOTE:
 
@@ -46,10 +46,9 @@ HIGHLIGHTS
 * SLURM has been modified to allocate specific cores to jobs and job steps in
   the centralized scheduler rather than the daemons running on the individual
   compute nodes. This permits effective preemption or gang scheduling of jobs.
-* A new configuration parameter, PrologSlurmctld, can be used to support the
-  booting of different operating systems for each job. See "man slurm.conf"
-  for details. EpilogSlurmctld has also been added to execution at job
-  completion.
+* New configuration parameters, PrologSlurmctld and EpilogSlurmctld, can be
+  used to support the booting of different operating systems for each job.
+  See "man slurm.conf" for details.
 * Preemption of jobs from lower priority partitions in order to execute jobs
   in higher priority partitions is now supported. The jobs from the lower
   priority partition will resume once the preempting job completes. For more
@@ -62,6 +61,9 @@ HIGHLIGHTS
   https://computing.llnl.gov/linux/slurm/sun_const.html
 * Support added for IBM BlueGene/P systems, including High Throughput
   Computing (HTC) mode.
+* Support for checkpoint/restart using BLCR added via the checkpoint/blcr
+  plugin. For more information see:
+  https://computing.llnl.gov/linux/slurm/checkpoint_blcr.html
 
 CONFIGURATION FILE CHANGES (see "man slurm.conf" for details)
 * The default AuthType is now "auth/munge" rather than "auth/none".
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index cbf37179bbfa2955d83105b66ffa68078d1aeaab..80d6d0f320b5044eee4fe5df7aa75dd531217e37 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -348,10 +348,9 @@ The default value is "NO".
 \fBEpilog\fR
 Fully qualified pathname of a script to execute as user root on every node
 when a user's job completes (e.g. "/usr/local/slurm/epilog"). This may
-be used to purge files, disable user login, etc. By default there is no epilog.
-The following environment variables are set for use by the program:
-SLURM_JOBID, SLURM_UID plus MPIRUN_PARTITION (only for BlueGene systems) or
-BASIL_PARTITION_ID (only for Cray systems running ALPS/BASIL).
+be used to purge files, disable user login, etc.
+By default there is no epilog.
+See \fBProlog and Epilog Scripts\fR for more information.
 
 .TP
 \fBEpilogMsgTime\fR
@@ -373,8 +372,7 @@ The program executes as SlurmUser, which gives it permission to drain nodes
 and requeue the job if a failure occurs or cancel the job if appropriate.
 The program can be used to reboot nodes or perform other work to prepare
 resources for use.
-The following environment variables are set for use by the program:
-SLURM_JOBID, SLURM_UID, SLURM_NODELIST and SLURM_CONSTRAINTS.
+See \fBProlog and Epilog Scripts\fR for more information.
 
 .TP
 \fBFastSchedule\fR
@@ -857,16 +855,7 @@ whenever it is asked to run a job step from a new job allocation (e.g. the
 first job step. This may be used to purge files, enable user login, etc.
 By default there is no prolog. Any configured script is expected to complete
 execution quickly (in less time than \fBMessageTimeout\fR).
-The following environment variables are set for use by the program:
-SLURM_JOBID, SLURM_UID plus MPIRUN_PARTITION (only for BlueGene systems) or
-BASIL_PARTITION_ID (only for Cray systems running ALPS/BASIL).
-
-NOTE: The Prolog script is ONLY run on any individual
-node when it first sees a job step from a new allocation; it does not
-run the Prolog immediately when an allocation is granted. If no job steps
-from an allocation are run on a node, it will never run the Prolog for that
-allocation. The Epilog, on the other hand, always runs on every node of an
-allocation when the allocation is released.
+See \fBProlog and Epilog Scripts\fR for more information.
 
 .TP
 \fBPrologSlurmctld\fR
@@ -877,10 +866,7 @@ The program executes as SlurmUser, which gives it permission to drain nodes
 and requeue the job if a failure occurs or cancel the job if appropriate.
 The program can be used to reboot nodes or perform other work to prepare
 resources for use.
-The available environments for the job would be identified as a
-node \fBFeature\fR and specified in the user request as a job constraint.
-The following environment variables are set for use by the program:
-SLURM_JOBID, SLURM_UID, SLURM_NODELIST and SLURM_CONSTRAINTS.
+See \fBProlog and Epilog Scripts\fR for more information.
 
 .TP
 \fBPropagatePrioProcess\fR
@@ -2057,6 +2043,73 @@ Recommended only for systems running with gang scheduling
 State of partition or availability for use.
 Possible values are "UP" or "DOWN".
 The default value is "UP".
+.SH "Prolog and Epilog Scripts"
+There are a variety of prolog and epilog program options that
+execute with various permissions and at various times.
+The four options most likely to be used are:
+\fBProlog\fR and \fBEpilog\fR (executed once on each compute node
+for each job) plus \fBPrologSlurmctld\fR and \fBEpilogSlurmctld\fR
+(executed once on the \fBControlMachine\fR for each job).
+
+NOTE: The Prolog script is ONLY run on any individual
+node when it first sees a job step from a new allocation; it does not
+run the Prolog immediately when an allocation is granted. If no job steps
+from an allocation are run on a node, it will never run the Prolog for that
+allocation. The Epilog, on the other hand, always runs on every node of an
+allocation when the allocation is released.
+
+Information about the job is passed to these scripts using environment
+variables.
+Unless otherwise specified, these environment variables are available
+to all of the programs.
+.TP
+\fBBASIL_RESERVATION_ID\fR
+Basil reservation ID.
+Available on Cray XT systems only.
+.TP
+\fBMPIRUN_PARTITION\fR
+BlueGene partition name.
+Available on BlueGene systems only.
+.TP
+\fBSLURM_JOB_ACCOUNT\fR
+Account name used for the job.
+Available in \fBPrologSlurmctld\fR and \fBEpilogSlurmctld\fR only.
+.TP
+\fBSLURM_JOB_CONSTRAINTS\fR
+Features required to run the job.
+Available in \fBPrologSlurmctld\fR and \fBEpilogSlurmctld\fR only.
+.TP
+\fBSLURM_JOB_GID\fR
+Group ID of the job's owner.
+Available in \fBPrologSlurmctld\fR and \fBEpilogSlurmctld\fR only.
+.TP
+\fBSLURM_JOB_GROUP\fR
+Group name of the job's owner.
+Available in \fBPrologSlurmctld\fR and \fBEpilogSlurmctld\fR only.
+.TP
+\fBSLURM_JOB_ID\fR
+Job ID.
+.TP
+\fBSLURM_JOB_NAME\fR
+Name of the job.
+Available in \fBPrologSlurmctld\fR and \fBEpilogSlurmctld\fR only.
+.TP
+\fBSLURM_JOB_NODELIST\fR
+Nodes assigned to the job. A SLURM hostlist expression.
+"scontrol show hostnames" can be used to convert this to a
+list of individual host names.
+Available in \fBPrologSlurmctld\fR and \fBEpilogSlurmctld\fR only.
+.TP
+\fBSLURM_JOB_PARTITION\fR
+Partition in which the job runs.
+Available in \fBPrologSlurmctld\fR and \fBEpilogSlurmctld\fR only.
+.TP
+\fBSLURM_JOB_UID\fR
+User ID of the job's owner.
+.TP
+\fBSLURM_JOB_USER\fR
+User name of the job's owner.
+
 .SH "NETWORK TOPOLOGY"
 SLURM is able to optimize job allocations to minimize network contention.
 Special SLURM logic is used to optimize allocations on systems with a
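The slurm.conf.5 additions above document the environment variables but do not show a consumer. The following standalone C program is an illustrative sketch only, not part of this change set: it reads a few of the documented variables the way a site-provided EpilogSlurmctld program might. The variable names come from the man page text above; the log file path and the helper get_env_or() are made-up examples.

/*
 * Illustrative sketch only: a minimal EpilogSlurmctld-style program that
 * reads some of the environment variables documented above.  The variable
 * names follow the man page text; the log file path and the helper
 * get_env_or() are made-up examples, not part of SLURM.
 */
#include <stdio.h>
#include <stdlib.h>

/* Return an environment variable's value, or a fallback when it is unset
 * (for example, SLURM_JOB_PARTITION is only set for the PrologSlurmctld
 * and EpilogSlurmctld programs). */
static const char *get_env_or(const char *name, const char *fallback)
{
	const char *val = getenv(name);
	return val ? val : fallback;
}

int main(void)
{
	/* Hypothetical log location; a real site would choose its own. */
	FILE *log = fopen("/tmp/slurmctld_epilog.log", "a");

	if (log == NULL)
		return 1;
	fprintf(log, "job=%s user=%s uid=%s partition=%s nodes=%s\n",
		get_env_or("SLURM_JOB_ID", "unknown"),
		get_env_or("SLURM_JOB_USER", "unknown"),
		get_env_or("SLURM_JOB_UID", "unknown"),
		get_env_or("SLURM_JOB_PARTITION", "n/a"),
		get_env_or("SLURM_JOB_NODELIST", "n/a"));
	fclose(log);
	return 0;	/* exit status 0 reports success */
}

If a site built such a program and named it as the EpilogSlurmctld program in slurm.conf, slurmctld would execute it once per job on the ControlMachine with these variables set.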
diff --git a/src/slurmctld/basil_interface.c b/src/slurmctld/basil_interface.c
index 1a2d27860a3d53f6160455f59b317b3fbd74c30b..dccf46edcb91b2c4bc5a3a0d4dbeeaed11a21270 100644
--- a/src/slurmctld/basil_interface.c
+++ b/src/slurmctld/basil_interface.c
@@ -194,7 +194,8 @@ extern int basil_query(void)
 	for (each_basil_reservation) {
 		bool found = false;
 		job_iterator = list_iterator_create(job_list);
-		while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
+		while ((job_ptr = (struct job_record *)
+				list_next(job_iterator))) {
 			select_g_get_jobinfo(job_ptr->select_jobinfo,
 					     SELECT_DATA_RESV_ID, &res_id);
 			found = !strcmp(res_id, basil_reservation_id);
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 34065142f9d5227872f9897d912deb6e2d69d326..c4981691a7def23d3f85f31e1d1b07c4f6f5a979 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -54,6 +54,7 @@
 #include "src/common/macros.h"
 #include "src/common/node_select.h"
 #include "src/common/slurm_accounting_storage.h"
+#include "src/common/uid.h"
 #include "src/common/xassert.h"
 #include "src/common/xstring.h"
 
@@ -70,6 +71,7 @@
 #define _DEBUG 0
 #define MAX_RETRIES 10
 
+static char ** _build_env(struct job_record *job_ptr);
 static void _depend_list_del(void *dep_ptr);
 static void _feature_list_delete(void *x);
 static void * _run_epilog(void *arg);
@@ -1077,6 +1079,44 @@ extern int epilog_slurmctld(struct job_record *job_ptr)
 	}
 }
 
+static char **_build_env(struct job_record *job_ptr)
+{
+	char **my_env, *name;
+
+	my_env = xmalloc(sizeof(char *));
+	my_env[0] = NULL;
+#ifdef HAVE_CRAY_XT
+	select_g_get_jobinfo(job_ptr->select_jobinfo,
+			     SELECT_DATA_RESV_ID, &name);
+	setenvf(&my_env, "BASIL_RESERVATION_ID", "%s", name);
+	xfree(name);
+#endif
+#ifdef HAVE_BG
+	select_g_get_jobinfo(job_ptr->select_jobinfo,
+			     SELECT_DATA_BLOCK_ID, &name);
+	setenvf(&my_env, "MPIRUN_PARTITION", "%s", name);
+#endif
+	setenvf(&my_env, "SLURM_JOB_ACCOUNT", "%s", job_ptr->account);
+	if (job_ptr->details) {
+		setenvf(&my_env, "SLURM_JOB_CONSTRAINTS",
+			"%s", job_ptr->details->features);
+	}
+	setenvf(&my_env, "SLURM_JOB_GID", "%u", job_ptr->group_id);
+	name = gid_to_string((gid_t) job_ptr->group_id);
+	setenvf(&my_env, "SLURM_JOB_GROUP", "%s", name);
+	xfree(name);
+	setenvf(&my_env, "SLURM_JOB_ID", "%u", job_ptr->job_id);
+	setenvf(&my_env, "SLURM_JOB_NAME", "%s", job_ptr->name);
+	setenvf(&my_env, "SLURM_JOB_NODELIST", "%s", job_ptr->nodes);
+	setenvf(&my_env, "SLURM_JOB_PARTITION", "%s", job_ptr->partition);
+	setenvf(&my_env, "SLURM_JOB_UID", "%u", job_ptr->user_id);
+	name = uid_to_string((uid_t) job_ptr->user_id);
+	setenvf(&my_env, "SLURM_JOB_USER", "%s", name);
+	xfree(name);
+
+	return my_env;
+}
+
 static void *_run_epilog(void *arg)
 {
 	struct job_record *job_ptr = (struct job_record *) arg;
@@ -1091,12 +1131,7 @@
 	lock_slurmctld(config_read_lock);
 	argv[0] = xstrdup(slurmctld_conf.epilog_slurmctld);
 	argv[1] = NULL;
-
-	my_env = xmalloc(sizeof(char *));
-	my_env[0] = NULL;
-	setenvf(&my_env, "SLURM_JOBID", "%u", job_ptr->job_id);
-	setenvf(&my_env, "SLURM_NODELIST", "%s", job_ptr->nodes);
-	setenvf(&my_env, "SLURM_UID", "%u", job_ptr->user_id);
+	my_env = _build_env(job_ptr);
 	job_id = job_ptr->job_id;
 	unlock_slurmctld(config_read_lock);
 
@@ -1193,16 +1228,7 @@ static void *_run_prolog(void *arg)
 	lock_slurmctld(config_read_lock);
 	argv[0] = xstrdup(slurmctld_conf.prolog_slurmctld);
 	argv[1] = NULL;
-
-	my_env = xmalloc(sizeof(char *));
-	my_env[0] = NULL;
-	if (job_ptr->details && job_ptr->details->features) {
-		setenvf(&my_env, "SLURM_CONSTRAINTS",
-			"%s", job_ptr->details->features);
-	}
-	setenvf(&my_env, "SLURM_JOBID", "%u", job_ptr->job_id);
-	setenvf(&my_env, "SLURM_NODELIST", "%s", job_ptr->nodes);
-	setenvf(&my_env, "SLURM_UID", "%u", job_ptr->user_id);
+	my_env = _build_env(job_ptr);
 	job_id = job_ptr->job_id;
 	unlock_slurmctld(config_read_lock);
 
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index fbab2ad261313cbd6b0298d7e9e6e0d133e5b470..8cf681a6a814cb3c2ba186cb462e666a3f6e60cd 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1120,8 +1120,8 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only,
 
 	acct_policy_job_begin(job_ptr);
 
-	jobacct_storage_g_job_start(
-		acct_db_conn, slurmctld_cluster_name, job_ptr);
+	jobacct_storage_g_job_start(acct_db_conn, slurmctld_cluster_name,
+				    job_ptr);
 
 	prolog_slurmctld(job_ptr);
 	slurm_sched_newalloc(job_ptr);
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index aed37c4aae73e6587b8af5a10ee03e3e05347634..3121fa1aacd3559c6baf37ab41482cab776ebed4 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -1,7 +1,7 @@
 /*****************************************************************************\
  *  src/slurmd/slurmd/req.c - slurmd request handling
  *****************************************************************************
- *  Copyright (C) 2002-2006 The Regents of the University of California.
+ *  Copyright (C) 2002-2007 The Regents of the University of California.
  *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Mark Grondona <mgrondona@llnl.gov>.
@@ -2567,7 +2567,8 @@ _rpc_suspend_job(slurm_msg_t *msg)
 	while ((stepd = list_next(i))) {
 		if (stepd->jobid != req->job_id) {
 			/* multiple jobs expected on shared nodes */
-			debug3("Step from other job: jobid=%u (this jobid=%u)",
+			debug3("Step from other job: jobid=%u "
+			       "(this jobid=%u)",
 			       stepd->jobid, req->job_id);
 			continue;
 		}
@@ -3048,12 +3049,19 @@ _rpc_update_time(slurm_msg_t *msg)
 	slurm_send_rc_msg(msg, rc);
 }
 
-/* NOTE: xfree returned value */
+/* NOTE: call _destroy_env() to free returned value */
 static char **
 _build_env(uint32_t jobid, uid_t uid, char *resv_id)
 {
+	char *name;
 	char **env = xmalloc(sizeof(char *));
+
+	env[0] = NULL;
+	setenvf(&env, "SLURM_JOB_ID", "%u", jobid);
+	setenvf(&env, "SLURM_JOB_UID", "%u", uid);
+	name = uid_to_string(uid);
+	setenvf(&env, "SLURM_JOB_USER", "%s", name);
+	xfree(name);
 	setenvf(&env, "SLURM_JOBID", "%u", jobid);
 	setenvf(&env, "SLURM_UID", "%u", uid);
 	if (resv_id) {
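Both _build_env() routines above return a heap-allocated, NULL-terminated array of "NAME=value" strings, which is why the updated comment in req.c points at _destroy_env() rather than a bare xfree(). The body of _destroy_env() is not shown in this excerpt; the sketch below only illustrates the general cleanup pattern for an array of that shape, written with plain malloc()/free() so it stands alone (SLURM's own code would use its xmalloc()/xfree() wrappers), and with made-up example values.

/*
 * Illustrative sketch only: releasing a NULL-terminated environment array
 * of the same shape as the one _build_env() returns.  This is NOT SLURM's
 * _destroy_env(); it uses plain strdup()/free() so it stands alone, and
 * the entry values below are made-up examples.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Free every "NAME=value" string, then the array itself. */
static void free_env_array(char **env)
{
	int i;

	if (env == NULL)
		return;
	for (i = 0; env[i] != NULL; i++)
		free(env[i]);
	free(env);
}

int main(void)
{
	/* Entries first, terminating NULL pointer last. */
	char **env = malloc(3 * sizeof(char *));

	if (env == NULL)
		return 1;
	env[0] = strdup("SLURM_JOB_ID=1234");
	env[1] = strdup("SLURM_JOB_USER=example_user");
	env[2] = NULL;

	printf("%s\n%s\n", env[0], env[1]);
	free_env_array(env);
	return 0;
}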