diff --git a/NEWS b/NEWS
index 3dde492b6914f316869fbad4bd18a787b4f67dc9..dc84c5ee5ef144e7ba1457abc8630e6ca8c8d7fa 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,8 @@ documents those changes that are of interest to users and admins.
 
 * Changes in SLURM 0.4.0-pre4
 =============================
+ -- Add support for MPICH-GM (from takao.hatazaki@hp.com)
+ -- Add support for NodeHostname in node configuration
  -- Make "scontrol show daemons" function properly on front-end system
     (e.g. Blue Gene)
 
diff --git a/doc/html/bluegene.html b/doc/html/bluegene.html
index a68b056fd9e8a6ec7ddc491cd52b091a22485096..1a04583efb8b640c048de6ffadc541d822f6dc74 100644
--- a/doc/html/bluegene.html
+++ b/doc/html/bluegene.html
@@ -9,7 +9,7 @@
 <meta http-equiv="keywords" content="Simple Linux Utility for Resource Management, SLURM, resource management, Linux clusters, high-performance computing, Livermore Computing">
 <meta name="LLNLRandR" content="UCRL-WEB-204324">
-<meta name="LLNLRandRdate" content="8 October 2004">
+<meta name="LLNLRandRdate" content="19 October 2004">
 <meta name="distribution" content="global">
 <meta name="description" content="Simple Linux Utility for Resource Management">
 <meta name="copyright"
@@ -143,8 +143,8 @@ a a a a . . . # Z
 <h3>System Administration</h3>
 <p>Building a Blue Gene compatible system is dependent upon the <i>configure</i>
-program locating some expected files. You should see "#define HAVE_BGL 1" in
-the "config.h" file before making SLURM.</p>
+program locating some expected files. You should see "#define HAVE_BGL 1" and
+"#define HAVE_FRONT_END 1" in the "config.h" file before making SLURM.</p>
 <p>The slurmctld daemon should execute on the system's service node with
 an optional backup daemon on one of the front end nodes.
@@ -167,9 +167,14 @@ to schedule all of them in priority order. </p>
 <p>SLURM node and partition descriptions should make use of the
 <a href="#naming">naming</a> conventions described above. For example,
-"NodeName=bgl[000x733] NodeAddr=frontend0 Procs=1024".
-Note that the NodeAddr value for all 128 base partitions is the name
-of the front end node executing the slurmd daemon.</p>
+"NodeName=bgl[000x733] NodeAddr=frontend0 NodeHostname=frontend0 Procs=1024".
+Note that the values of both NodeAddr and NodeHostname for all
+128 base partitions are the name of the front end node executing
+the slurmd daemon.
+The NodeName values represent base partitions.
+No computer is actually expected to return "bgl000"
+in response to the <i>hostname</i> command, nor will any attempt
+be made to route message traffic to this address. </p>
 <p>While users are unable to initiate SLURM job steps on Blue Gene systems,
 this restriction does not apply to user root or SlurmUser.
@@ -207,7 +212,7 @@ rebooting of c-nodes and I/O nodes.</p>
 <td colspan="3"><hr>
 <p>For information about this page, contact <a href="mailto:slurm-dev@lists.llnl.gov">slurm-dev@lists.llnl.gov</a>.</p>
 <p><a href="http://www.llnl.gov/"><img align=middle src="lll.gif" width="32" height="32" border="0"></a></p>
 <p class="footer">UCRL-WEB-207187<br>
-Last modified 8 October 2004</p></td>
+Last modified 19 October 2004</p></td>
 </tr>
 </table>
 </td>
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 659689f125118bcfd4ef00b60877b6d334af7b7b..b3b779a3e6f6bfe56704bf71df226f2d8519fdca 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -140,6 +140,14 @@ authentication by Slurm daemons.
 Fully qualified pathname of a file containing a public key used for
 authentication by Slurm daemons.
 .TP
+\fBKillTree\fR
+If set to "1", signals (e.g. Ctrl-C or scancel) are forwarded to all
+descendant processes of the one that was directly invoked by the user.
+This is always required if \fBMpichGmDirectSupport\fR is set to "1".
+The default behavior is that signals are forwarded only to processes
+that belong to the process group of the process that was directly
+invoked by the user.
+NOTE: This option is not currently supported on AIX systems.
+.TP
 \fBKillWait\fR
 The interval, in seconds, given to a job's processes between the
 SIGTERM and SIGKILL signals upon reaching its time limit.
@@ -163,6 +171,11 @@ SLURM's active database. Set the values of \fBMaxJobCount\fR and
 its memory or other resources. The default value is 300 seconds.
 A value of zero prevents any job record purging.
 .TP
+\fBMpichGmDirectSupport\fR
+If set to "1", srun handles executable files linked with the MPICH-GM
+library directly, rather than via mpirun, which uses rsh. If set,
+\fBKillTree\fR must also be set to "1".
+.TP
 \fBPluginDir\fR
 Identifies the places in which to look for SLURM plugins.
 This is a colon-separated list of directories, like the PATH
@@ -374,54 +387,62 @@ information as shown in the example.
 A job step's tasks are allocated to nodes in the order the nodes appear
 in the configuration file. There is presently no capability within
 SLURM to arbitrarily order a job step's tasks.
-The node configuration specifies the following information:
-.TP
-\fBNodeName\fR
-Name of a node as returned by the hostname command,
-without the full domain name (e.g. "lx0012").
-A simple node range expression may optionally
-be used to specify ranges
-of nodes to avoid building a configuration file with large numbers
-of entries. The node range expression can contain one
+.LP
+A simple node range expression may optionally be used to specify
+ranges of nodes to avoid building a configuration file with large
+numbers of entries. The node range expression can contain one
 pair of square brackets with a sequence of comma separated
 numbers and/or ranges of numbers separated by a "-"
 (e.g. "linux[0-64,128]", or "lx[15,18,32-33]").
+Presently the numeric range must be the last characters in the
+node name (e.g. "unit[0-31]rack1" is invalid).
+The node configuration specifies the following information:
+.TP
+\fBNodeName\fR
+Name that SLURM uses to refer to a node.
+Typically this would be the string that "/bin/hostname -s"
+returns; however, it may be an arbitrary string if
+\fBNodeHostname\fR is specified.
 If the \fBNodeName\fR is "DEFAULT", the values specified
 with that record will apply to subsequent node specifications
 unless explicitly set to other values in that node record or
 replaced with a different set of default values.
 For architectures in which the node order is significant,
 nodes will be considered consecutive in the order defined.
-For example, if the configuration for NodeName=charlie immediately
-follows the configuration for NodeName=baker they will be
+For example, if the configuration for "NodeName=charlie" immediately
+follows the configuration for "NodeName=baker" they will be
 considered adjacent in the computer.
 .TP
-\fBFeature\fR
-A comma delimited list of arbitrary strings indicative of some
-characteristic associated with the node.
-There is no value associated with a feature at this time, a node
-either has a feature or it does not.
-If desired a feature may contain a numeric component indicating,
-for example, processor speed.
-By default a node has no features.
+\fBNodeHostname\fR
+The string that "/bin/hostname -s" returns.
+A node range expression can be used to specify a set of nodes.
+If an expression is used, the number of nodes identified by
+\fBNodeHostname\fR on a line in the configuration file must
+be identical to the number of nodes identified by \fBNodeName\fR.
+By default, the \fBNodeHostname\fR will be identical in value to
+\fBNodeName\fR.
 .TP
 \fBNodeAddr\fR
 Name that a node should be referred to in establishing
-a communications path. This name will be used as an
+a communications path.
+This name will be used as an
 argument to the gethostbyname() function for identification.
-For example, "elx0012" might be used to designate
-the ethernet address for node "lx0012". A simple node range
-expression may optionally be used to specify ranges
-of nodes. The node range expression can contain one
-pair of square brackets with a sequence of comma separated
-numbers and/or ranges of numbers separated by a "-"
-(e.g. "elinux[0-64,128]").
 If a node range expression is used to designate multiple nodes,
 they must exactly match the entries in the \fBNodeName\fR
 (e.g. "NodeName=lx[0-7] NodeAddr=elx[0-7]").
-By default the \fBNodeAddr\fR will be identical in value to
+\fBNodeAddr\fR may also contain IP addresses.
+By default, the \fBNodeAddr\fR will be identical in value to
 \fBNodeName\fR.
 .TP
+\fBFeature\fR
+A comma delimited list of arbitrary strings indicative of some
+characteristic associated with the node.
+There is no value associated with a feature at this time, a node
+either has a feature or it does not.
+If desired a feature may contain a numeric component indicating,
+for example, processor speed.
+By default a node has no features.
+.TP
 \fBRealMemory\fR
 Size of real memory on the node in MegaBytes (e.g. "2048").
 The default value is 1.
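
To make the interaction of the new options concrete, a minimal slurm.conf
fragment combining them might look like the following (a sketch reusing the
man page's example node names; the Procs value is illustrative):

    MpichGmDirectSupport=1
    KillTree=1
    NodeName=lx[0-7] NodeAddr=elx[0-7] Procs=2

With this configuration srun launches MPICH-GM executables directly, and
KillTree=1 ensures signals reach every descendant of the launched process
rather than only its process group.
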
diff --git a/etc/slurm.conf.example b/etc/slurm.conf.example
index 8824a3bc4161877978f0569bb2f6f784095921d6..1cbcb4aef4c9dfb8d4149e0c033d575566f3e6e8 100644
--- a/etc/slurm.conf.example
+++ b/etc/slurm.conf.example
@@ -264,13 +264,27 @@
 #	have in its active database at one time. Set the
 #	values of MaxJobCount and MinJobAge so as to avoid
 #	having slurmctld exhaust its memory or other resources.
+# "MpichGmDirectSupport"
+#	: if set to `1', srun handles executable files linked
+#	  with the MPICH-GM library directly, rather than via
+#	  mpirun, which uses rsh.
+# "KillTree"	: if set to `1', signals (e.g. Ctrl-C or scancel) are
+#	  forwarded to all descendant processes of the one that was
+#	  directly invoked by the user. This is always
+#	  required if MpichGmDirectSupport is set to 1. The
+#	  default behavior is that signals are forwarded to
+#	  processes that belong to the process group of the
+#	  process that was directly invoked by the user.
+#
 # Example:
 #
-# FastSchedule=0	# default is `1'
-# FirstJobid=1000	# default is `1'
-# HashBase=8		# default is `10'
-# ReturnToService=1	# default is `0'
+# FastSchedule=0		# default is `1'
+# FirstJobid=1000		# default is `1'
+# HashBase=8			# default is `10'
+# ReturnToService=1		# default is `0'
 # MaxJobCount=10000		# Defaults to 2000
+# MpichGmDirectSupport=1	# default is `0'
+# KillTree=1			# default is `0'
 #
@@ -334,11 +348,20 @@
 #	specifies a node or set of nodes to be managed by SLURM.
 #	The special NodeName of "DEFAULT" may be used to establish
 #	default node configuration parameters for subsequent node
-#	records.
+#	records. Typically this would be the string that
+#	`/bin/hostname -s` would return on the node. However,
+#	NodeName may be an arbitrary string if NodeHostname is
+#	used (see below).
 #
 # "Feature"    : comma separated list of "features" for the given node(s)
 #
-# "NodeAddr"   : preferred address for contacting the node
+# "NodeAddr"   : preferred address for contacting the node. This may be
+#	either a name or an IP address.
+#
+# "NodeHostname"
+#	: the string that `/bin/hostname -s` would return on the
+#	  node. In other words, NodeName may be a name other than
+#	  the real hostname.
 #
 # "RealMemory" : Amount of real memory (in Megabytes)
 #
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 9333e15b00a5fb6cfcbf03f8cf18af08af3d29b2..e1bf756067743396f9819d3f61f2e8ee6a49071d 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -487,11 +487,13 @@ typedef struct slurm_ctl_conf {
 				 * inactive resource allocation is released */
 	char *job_comp_type;	/* job completion logger type */
 	char *job_comp_loc;	/* job completion logging location */
+	uint16_t kill_tree;	/* Kill all descendant processes */
 	uint16_t kill_wait;	/* seconds between SIGXCPU to SIGKILL
 				 * on job termination */
 	uint16_t max_job_cnt;	/* maximum number of active jobs */
 	uint16_t min_job_age;	/* COMPLETED jobs over this age (secs)
 				 * purged from in memory records */
+	uint16_t mpich_gm_dir;	/* MPICH-GM direct support */
 	char *plugindir;	/* pathname to plugins */
 	char *prolog;		/* pathname of job prolog */
 	uint16_t ret2service;	/* 1 return DOWN node to service at
diff --git a/src/api/config_info.c b/src/api/config_info.c
index 72fe910845f46c9bea44b96a43a42c0036d30183..ee52ec1c98f0dc312b6f106f4b86869a0afe30c6 100644
--- a/src/api/config_info.c
+++ b/src/api/config_info.c
@@ -95,12 +95,16 @@ void slurm_print_ctl_conf ( FILE* out,
 		 slurm_ctl_conf_ptr->job_credential_private_key);
 	fprintf(out, "JobCredPublicKey  = %s\n",
 		 slurm_ctl_conf_ptr->job_credential_public_certificate);
+	fprintf(out, "KillTree          = %u\n",
+		 slurm_ctl_conf_ptr->kill_tree);
 	fprintf(out, "KillWait          = %u\n",
 		 slurm_ctl_conf_ptr->kill_wait);
 	fprintf(out, "MaxJobCnt         = %u\n",
 		 slurm_ctl_conf_ptr->max_job_cnt);
 	fprintf(out, "MinJobAge         = %u\n",
 		 slurm_ctl_conf_ptr->min_job_age);
+	fprintf(out, "MpichGmDirectSupport = %u\n",
+		 slurm_ctl_conf_ptr->mpich_gm_dir);
 	fprintf(out, "PluginDir         = %s\n",
 		 slurm_ctl_conf_ptr->plugindir);
 	fprintf(out, "Prolog            = %s\n",
diff --git a/src/common/read_config.c b/src/common/read_config.c
index 9b6153ec1c5c5f4713f19591ef965b4ee7e1e74d..4e0de4497048a8762b1f069e1dc313c5c320c85a 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -41,6 +41,7 @@
 
 #include <slurm/slurm.h>
 
+#include "src/common/hostlist.h"
 #include "src/common/slurm_protocol_defs.h"
 #include "src/common/log.h"
 #include "src/common/macros.h"
@@ -58,6 +59,193 @@
 static int _parse_node_spec (char *in_line);
 static int _parse_part_spec (char *in_line);
 
+typedef struct names_ll_s {
+	char *node_hostname;
+	char *node_name;
+	struct names_ll_s *next;
+} names_ll_t;
+#define NAME_HASH_LEN 512
+static names_ll_t *host_to_node_hashtbl[NAME_HASH_LEN] = {NULL};
+static names_ll_t *node_to_host_hashtbl[NAME_HASH_LEN] = {NULL};
+
+static void _free_name_hashtbl()
+{
+	int i;
+	names_ll_t *p, *q;
+
+	for (i=0; i<NAME_HASH_LEN; i++) {
+		p = host_to_node_hashtbl[i];
+		while (p) {
+			xfree(p->node_hostname);
+			xfree(p->node_name);
+			q = p->next;
+			xfree(p);
+			p = q;
+		}
+		host_to_node_hashtbl[i] = NULL;
+		p = node_to_host_hashtbl[i];
+		while (p) {
+			xfree(p->node_hostname);
+			xfree(p->node_name);
+			q = p->next;
+			xfree(p);
+			p = q;
+		}
+		node_to_host_hashtbl[i] = NULL;
+	}
+}
+
+static void _init_name_hashtbl()
+{
+	return;
+}
+
+static int _get_hash_idx(char *s)
+{
+	int i;
+
+	i = 0;
+	while (*s) i += (int)*s++;
+	return i % NAME_HASH_LEN;
+}
+
+static void _push_to_hashtbl(char *node, char *host)
+{
+	int idx;
+	names_ll_t *p, *new;
+	char *hh;
+
+	hh = host ? host : node;
+	idx = _get_hash_idx(hh);
+#ifndef HAVE_FRONT_END	/* Duplicate hostnames are expected on front-end
+			 * systems, so check only without a front-end */
+	p = host_to_node_hashtbl[idx];
+	while (p) {
+		if (strcmp(p->node_hostname, hh)==0) {
+			fatal("Duplicate NodeHostname in the config file");
+			return;
+		}
+		p = p->next;
+	}
+#endif
+	new = (names_ll_t *)xmalloc(sizeof(*new));
+	new->node_hostname = xstrdup(hh);
+	new->node_name = xstrdup(node);
+	new->next = host_to_node_hashtbl[idx];
+	host_to_node_hashtbl[idx] = new;
+
+	idx = _get_hash_idx(node);
+	p = node_to_host_hashtbl[idx];
+	while (p) {
+		if (strcmp(p->node_name, node)==0) {
+			fatal("Duplicate NodeName in the config file");
+			return;
+		}
+		p = p->next;
+	}
+	new = (names_ll_t *)xmalloc(sizeof(*new));
+	new->node_name = xstrdup(node);
+	new->node_hostname = xstrdup(hh);
+	new->next = node_to_host_hashtbl[idx];
+	node_to_host_hashtbl[idx] = new;
+}
+
+/*
+ * Register the given NodeName in the alias table.
+ * If node_hostname is NULL, only node_name will be used and
+ * no lookup table record is created.
+ */
+extern void register_conf_node_aliases(char *node_name, char *node_hostname)
+{
+	hostlist_t node_list = NULL, host_list = NULL;
+	char *hn = NULL, *nn;
+	static char *me = NULL;
+
+	if (node_hostname == NULL
+	    || node_name == NULL || *node_name == '\0')
+		return;
+	if (strcasecmp(node_name, "DEFAULT") == 0) {
+		if (node_hostname) {
+			fatal("NodeHostname for NodeName=DEFAULT is illegal");
+		}
+		return;
+	}
+	if (!me) {
+		me = xmalloc(MAX_NAME_LEN);
+		getnodename(me, MAX_NAME_LEN);
+	}
+	if (strcasecmp(node_name, "localhost") == 0)
+		node_name = me;
+	if (node_hostname && (strcasecmp(node_hostname, "localhost") == 0))
+		node_hostname = me;
+
+	node_list = hostlist_create(node_name);
+#ifdef HAVE_FRONT_END	/* Common NodeHostname for all NodeName values */
+	/* Expect one common node_hostname for all back-end nodes */
+	hn = node_hostname;
+#else
+	if (node_hostname && *node_hostname != '\0') {
+		host_list = hostlist_create(node_hostname);
+		if (hostlist_count(node_list) != hostlist_count(host_list))
+			fatal("NodeName and NodeHostname have different "
+			      "number of records");
+	}
+#endif
+	while ((nn = hostlist_shift(node_list))) {
+		if (host_list)
+			hn = hostlist_shift(host_list);
+		_push_to_hashtbl(nn, hn);
+		if (host_list)
+			free(hn);
+		free(nn);
+	}
+	hostlist_destroy(node_list);
+	if (host_list)
+		hostlist_destroy(host_list);
+
+	return;
+}
+
+/*
+ * get_conf_node_hostname - Return the NodeHostname for given NodeName
+ */
+extern char *get_conf_node_hostname(char *node_name)
+{
+	int idx;
+	names_ll_t *p;
+
+	idx = _get_hash_idx(node_name);
+	p = node_to_host_hashtbl[idx];
+	while (p) {
+		if (strcmp(p->node_name, node_name) == 0) {
+			return xstrdup(p->node_hostname);
+		}
+		p = p->next;
+	}
+	return xstrdup(node_name);
+}
+
+/*
+ * get_conf_node_name - Return the NodeName for given NodeHostname
+ */
+extern char *get_conf_node_name(char *node_hostname)
+{
+	int idx;
+	names_ll_t *p;
+
+	idx = _get_hash_idx(node_hostname);
+	p = host_to_node_hashtbl[idx];
+	while (p) {
+		if (strcmp(p->node_hostname, node_hostname) == 0) {
+			return xstrdup(p->node_name);
+		}
+		p = p->next;
+	}
+	return xstrdup(node_hostname);
+}
+
+
+
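
Taken together, register_conf_node_aliases() and the two get_conf_node_*()
functions give a two-way NodeName/NodeHostname mapping. A minimal usage
sketch (illustrative only, not part of the patch; the "lx"/"host" names are
hypothetical):

	char *hn, *nn;
	/* after slurm.conf supplies "NodeName=lx[0-7] NodeHostname=host[0-7]" */
	register_conf_node_aliases("lx[0-7]", "host[0-7]");
	hn = get_conf_node_hostname("lx3");	/* xstrdup'd copy of "host3" */
	nn = get_conf_node_name("host3");	/* xstrdup'd copy of "lx3" */
	/* Both lookups fall back to returning an xstrdup'd copy of their
	 * argument when no alias is registered; callers must xfree() the
	 * returned strings. */
	xfree(hn);
	xfree(nn);
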
/* getnodename - equivalent to gethostname, but return only the first * component of the fully qualified name * (e.g. "linux123.foo.bar" becomes "linux123") @@ -119,6 +307,8 @@ free_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->state_save_location); xfree (ctl_conf_ptr->switch_type); xfree (ctl_conf_ptr->tmp_fs); + + _free_name_hashtbl(); } /* @@ -146,9 +336,11 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->job_comp_type); xfree (ctl_conf_ptr->job_credential_private_key); xfree (ctl_conf_ptr->job_credential_public_certificate); + ctl_conf_ptr->kill_tree = (uint16_t) NO_VAL; ctl_conf_ptr->kill_wait = (uint16_t) NO_VAL; ctl_conf_ptr->max_job_cnt = (uint16_t) NO_VAL; ctl_conf_ptr->min_job_age = (uint16_t) NO_VAL; + ctl_conf_ptr->mpich_gm_dir = (uint16_t) NO_VAL; xfree (ctl_conf_ptr->plugindir); xfree (ctl_conf_ptr->prolog); ctl_conf_ptr->ret2service = (uint16_t) NO_VAL; @@ -173,6 +365,10 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->switch_type); xfree (ctl_conf_ptr->tmp_fs); ctl_conf_ptr->wait_time = (uint16_t) NO_VAL; + + _free_name_hashtbl(); + _init_name_hashtbl(); + return; } @@ -201,6 +397,7 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) int slurmctld_debug = -1, slurmd_debug = -1; int max_job_cnt = -1, min_job_age = -1, wait_time = -1; int slurmctld_port = -1, slurmd_port = -1; + int mpich_gm_dir = -1, kill_tree = -1; char *backup_addr = NULL, *backup_controller = NULL; char *checkpoint_type = NULL, *control_addr = NULL; char *control_machine = NULL, *epilog = NULL; @@ -236,9 +433,11 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) "JobCredentialPrivateKey=", 's', &job_credential_private_key, "JobCredentialPublicCertificate=", 's', &job_credential_public_certificate, + "KillTree=", 'd', &kill_tree, "KillWait=", 'd', &kill_wait, "MaxJobCount=", 'd', &max_job_cnt, "MinJobAge=", 'd', &min_job_age, + "MpichGmDirectSupport=", 'd', &mpich_gm_dir, "PluginDir=", 's', &plugindir, "Prolog=", 's', &prolog, "ReturnToService=", 'd', &ret2service, @@ -391,6 +590,20 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) job_credential_public_certificate; } + if ( kill_tree != -1) { + if ( ctl_conf_ptr->kill_tree != (uint16_t) NO_VAL) + error (MULTIPLE_VALUE_MSG, "KillTree"); +#if HAVE_AIX + if (kill_tree) { + error("KillTree=%d presently invalid on AIX", + kill_tree); + kill_tree = 0; + } +#else + ctl_conf_ptr->kill_tree = kill_tree; +#endif + } + if ( kill_wait != -1) { if ( ctl_conf_ptr->kill_wait != (uint16_t) NO_VAL) error (MULTIPLE_VALUE_MSG, "KillWait"); @@ -409,6 +622,12 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) ctl_conf_ptr->min_job_age = min_job_age; } + if ( mpich_gm_dir != -1) { + if ( ctl_conf_ptr->mpich_gm_dir != (uint16_t) NO_VAL) + error (MULTIPLE_VALUE_MSG, "MpichGmDirectSupport"); + ctl_conf_ptr->mpich_gm_dir = mpich_gm_dir; + } + if ( plugindir ) { if ( ctl_conf_ptr->plugindir ) { error( MULTIPLE_VALUE_MSG, "PluginDir" ); @@ -602,12 +821,14 @@ _parse_node_spec (char *in_line) int error_code; char *feature = NULL, *node_addr = NULL, *node_name = NULL; char *state = NULL, *reason=NULL; + char *node_hostname = NULL; int cpus_val, real_memory_val, tmp_disk_val, weight_val; error_code = slurm_parser (in_line, "Feature=", 's', &feature, "NodeAddr=", 's', &node_addr, "NodeName=", 's', &node_name, + "NodeHostname=", 's', &node_hostname, "Procs=", 'd', &cpus_val, "RealMemory=", 'd', &real_memory_val, "Reason=", 's', &reason, @@ -619,13 
+840,18 @@ _parse_node_spec (char *in_line) if (error_code) return error_code; + if (node_name) { + register_conf_node_aliases(node_name, node_hostname); + } + xfree(feature); xfree(node_addr); xfree(node_name); + xfree(node_hostname); xfree(reason); xfree(state); - return 0; + return error_code; } /* @@ -791,7 +1017,7 @@ void validate_config (slurm_ctl_conf_t *ctl_conf_ptr) { if ((ctl_conf_ptr->backup_controller != NULL) && - (strcmp("localhost", ctl_conf_ptr->backup_controller) == 0)) { + (strcasecmp("localhost", ctl_conf_ptr->backup_controller) == 0)) { xfree (ctl_conf_ptr->backup_controller); ctl_conf_ptr->backup_controller = xmalloc (MAX_NAME_LEN); if ( getnodename (ctl_conf_ptr->backup_controller, @@ -812,7 +1038,7 @@ validate_config (slurm_ctl_conf_t *ctl_conf_ptr) if (ctl_conf_ptr->control_machine == NULL) fatal ("validate_config: ControlMachine not specified."); - else if (strcmp("localhost", ctl_conf_ptr->control_machine) == 0) { + else if (strcasecmp("localhost", ctl_conf_ptr->control_machine) == 0) { xfree (ctl_conf_ptr->control_machine); ctl_conf_ptr->control_machine = xmalloc (MAX_NAME_LEN); if ( getnodename (ctl_conf_ptr->control_machine, @@ -864,6 +1090,9 @@ validate_config (slurm_ctl_conf_t *ctl_conf_ptr) if (ctl_conf_ptr->job_comp_type == NULL) ctl_conf_ptr->job_comp_type = xstrdup(DEFAULT_JOB_COMP_TYPE); + if (ctl_conf_ptr->kill_tree == (uint16_t) NO_VAL) + ctl_conf_ptr->kill_tree = DEFAULT_KILL_TREE; + if (ctl_conf_ptr->kill_wait == (uint16_t) NO_VAL) ctl_conf_ptr->kill_wait = DEFAULT_KILL_WAIT; @@ -873,6 +1102,9 @@ validate_config (slurm_ctl_conf_t *ctl_conf_ptr) if (ctl_conf_ptr->min_job_age == (uint16_t) NO_VAL) ctl_conf_ptr->min_job_age = DEFAULT_MIN_JOB_AGE; + if (ctl_conf_ptr->mpich_gm_dir == (uint16_t) NO_VAL) + ctl_conf_ptr->mpich_gm_dir = DEFAULT_MPICH_GM_DIR; + if (ctl_conf_ptr->plugindir == NULL) ctl_conf_ptr->plugindir = xstrdup(SLURM_PLUGIN_PATH); diff --git a/src/common/read_config.h b/src/common/read_config.h index 4666d88d0fe3a1bae09b8e218bb3eff01f2eb212..789e4725fcd22fe2198517524698c41819af9ed2 100644 --- a/src/common/read_config.h +++ b/src/common/read_config.h @@ -57,6 +57,8 @@ #define DEFAULT_SWITCH_TYPE "switch/none" #define DEFAULT_TMP_FS "/tmp" #define DEFAULT_WAIT_TIME 0 +#define DEFAULT_MPICH_GM_DIR 0 +#define DEFAULT_KILL_TREE 0 /* * init_slurm_conf - initialize or re-initialize the slurm configuration @@ -76,9 +78,28 @@ extern void free_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr); * getnodename - equivalent to gethostname(), but return only the first * component of the fully qualified name (e.g. "linux123.foo.bar" * becomes "linux123") + * NOTE: NodeName in the config may be different from real hostname. + * Use get_conf_node_name() to get the former. */ extern int getnodename (char *name, size_t len); +/* + * Register the given NodeName in the alias table. + * If node_hostname is NULL, only node_name will be used and + * no lookup table record is created. 
+ */ +extern void register_conf_node_aliases(char *node_name, char *node_hostname); + +/* + * get_conf_node_hostname - Return the NodeHostname for given NodeName + */ +extern char *get_conf_node_hostname(char *node_name); + +/* + * get_conf_node_name - Return the NodeName for given NodeHostname + */ +extern char *get_conf_node_name(char *node_hostname); + /* * parse_config_spec - parse the overall configuration specifications, update * values diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index b280bdec0913f7409b8361cbbec2a1781932ab96..4a900a0065d46ea3183d677285497e311bb39c87 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -340,6 +340,20 @@ uint16_t slurm_get_wait_time(void) return wait_time; } +/* slurm_get_mpich_gm_dir + * returns mpich_gm_dir from slurmctld_conf object + * RET uint16_t - mpich_gm_dir + */ +uint16_t slurm_get_mpich_gm_dir(void) +{ + uint16_t mpich_gm_dir; + + _lock_update_config(); + mpich_gm_dir = slurmctld_conf.mpich_gm_dir; + slurm_mutex_unlock(&config_lock); + return mpich_gm_dir; +} + /* Change general slurm communication errors to slurmctld specific errors */ static void _remap_slurmctld_errno(void) { diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index cd232664c75421d389a1fd373d86fee71d1c5ea0..5cc9acaa591be0eae55f8e3e48148ce044937ba1 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -166,6 +166,12 @@ char *slurm_get_switch_type(void); */ uint16_t slurm_get_wait_time(void); +/* slurm_get_mpich_gm_dir + * returns mpich_gm_dir from slurmctld_conf object + * RET uint16_t - mpich_gm_dir + */ +uint16_t slurm_get_mpich_gm_dir(void); + /**********************************************************************\ * general message management functions used by slurmctld, slurmd \**********************************************************************/ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 61903c0a62bfe7045a852ddd4d7ea68ab8c70f22..c2d47a1d2dc049b83626656f5ebb0976cd947fad 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -1711,9 +1711,11 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) pack16(build_ptr->inactive_limit, buffer); packstr(build_ptr->job_comp_loc, buffer); packstr(build_ptr->job_comp_type, buffer); + pack16(build_ptr->kill_tree, buffer); pack16(build_ptr->kill_wait, buffer); pack16(build_ptr->max_job_cnt, buffer); pack16(build_ptr->min_job_age, buffer); + pack16(build_ptr->mpich_gm_dir, buffer); packstr(build_ptr->plugindir, buffer); packstr(build_ptr->prolog, buffer); pack16(build_ptr->ret2service, buffer); @@ -1773,9 +1775,11 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** safe_unpack16(&build_ptr->inactive_limit, buffer); safe_unpackstr_xmalloc(&build_ptr->job_comp_loc, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&build_ptr->job_comp_type, &uint16_tmp, buffer); + safe_unpack16(&build_ptr->kill_tree, buffer); safe_unpack16(&build_ptr->kill_wait, buffer); safe_unpack16(&build_ptr->max_job_cnt, buffer); safe_unpack16(&build_ptr->min_job_age, buffer); + safe_unpack16(&build_ptr->mpich_gm_dir, buffer); safe_unpackstr_xmalloc(&build_ptr->plugindir, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&build_ptr->prolog, &uint16_tmp, buffer); safe_unpack16(&build_ptr->ret2service, buffer); @@ -2669,6 +2673,7 @@ _pack_batch_job_launch_msg(batch_job_launch_msg_t * msg, Buf buffer) pack32(msg->job_id, buffer); 
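	/* Annotation (not part of the patch): pack and unpack must visit
	 * fields in exactly the same order on both sides of the RPC, so
	 * the nprocs field added below after uid is matched by a
	 * safe_unpack32() at the same position in
	 * _unpack_batch_job_launch_msg() later in this hunk. */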
pack32(msg->uid, buffer); + pack32(msg->nprocs, buffer); pack16(msg->num_cpu_groups, buffer); pack32_array(msg->cpus_per_node, msg->num_cpu_groups, buffer); @@ -2704,6 +2709,7 @@ _unpack_batch_job_launch_msg(batch_job_launch_msg_t ** msg, Buf buffer) safe_unpack32(&launch_msg_ptr->job_id, buffer); safe_unpack32(&launch_msg_ptr->uid, buffer); + safe_unpack32(&launch_msg_ptr->nprocs, buffer); safe_unpack16(&launch_msg_ptr->num_cpu_groups, buffer); if (launch_msg_ptr->num_cpu_groups > 0) { diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 724f5acd5712d53a63aea87cec314810f5f66d3a..8a4cc6882b98f0efdd2299e766680c89d80e4d69 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -154,7 +154,7 @@ extern int select_p_part_init(List part_list) /* * select_p_job_test - Given a specification of scheduling requirements, - * identify the nodes which "best" satify the request. + * identify the nodes which "best" satisfy the request. * "best" is defined as either single set of consecutive nodes satisfying * the request and leaving the minimum number of unused nodes OR * the fewest number of consecutive node sets @@ -373,8 +373,9 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap, consec_cpus[best_fit_location] = 0; consec_nodes[best_fit_location] = 0; } + if (error_code && (rem_cpus <= 0) && - max_nodes && ((max_nodes - rem_nodes) >= min_nodes)) + ((max_nodes == 0) || ((max_nodes - rem_nodes) >= min_nodes))) error_code = SLURM_SUCCESS; xfree(consec_cpus); diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 4a5a1ead5bdf5f407c426791f3b958a1e08a7826..ce65b6b6943bf425451594054a1ffac003a1b7f8 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -88,8 +88,6 @@ static int _load_jobs (job_info_msg_t ** job_buffer_pptr); static int _load_nodes (node_info_msg_t ** node_buffer_pptr, uint16_t show_flags); static int _load_partitions (partition_info_msg_t **part_info_pptr); -static void _parse_conf_line (char *in_line, bool *any_slurmctld, - bool *have_slurmctld, bool *have_slurmd); static void _pid_info(pid_t job_pid); static void _ping_slurmctld(slurm_ctl_conf_info_msg_t *slurm_ctl_conf_ptr); static void _print_completing (void); @@ -665,134 +663,40 @@ _ping_slurmctld(slurm_ctl_conf_info_msg_t *slurm_ctl_conf_ptr) static void _print_daemons (void) { + slurm_ctl_conf_info_msg_t conf; + char me[MAX_NAME_LEN], *b, *c, *n; + int actld = 0, ctld = 0, d = 0; char daemon_list[] = "slurmctld slurmd"; - FILE *slurm_spec_file; - int line_num, line_size, i, j; - char in_line[BUF_SIZE]; - bool have_slurmctld = false, have_slurmd = false; - bool any_slurmctld = false; - - slurm_spec_file = fopen (SLURM_CONFIG_FILE, "r"); - if (slurm_spec_file == NULL) { - if (quiet_flag == -1) - fprintf(stderr, "Can't open %s\n", - SLURM_CONFIG_FILE); - exit(1); - } - /* process the data file */ - line_num = 0; - while (fgets (in_line, BUF_SIZE, slurm_spec_file) != NULL) { - line_num++; - line_size = strlen (in_line); - if (line_size >= (BUF_SIZE - 1)) { - exit_code = 1; - if (quiet_flag == -1) - fprintf(stderr, - "Line %d of config file %s too long\n", - line_num, SLURM_CONFIG_FILE); - continue; /* bad config file */ - } - - /* everything after a non-escaped "#" is a comment */ - /* replace comment flag "#" with a `\0' (End of string) */ - /* an escaped value "\#" is translated to "#" */ - /* this permitted embedded "#" in node/partition names */ - for (i = 0; i < line_size; i++) { 
- if (in_line[i] == '\0') - break; - if (in_line[i] != '#') - continue; - if ((i > 0) && (in_line[i - 1] == '\\')) { - for (j = i; j < line_size; j++) { - in_line[j - 1] = in_line[j]; - } - line_size--; - continue; - } - in_line[i] = '\0'; - break; - } - - _parse_conf_line (in_line, &any_slurmctld, - &have_slurmctld, &have_slurmd); - if (have_slurmctld && have_slurmd) - break; + bzero(&conf, sizeof(conf)); + if (read_slurm_conf_ctl(&conf) != SLURM_SUCCESS) + return; + getnodename(me, MAX_NAME_LEN); + if ((b = conf.backup_controller)) { + if ((strcmp(b, me) == 0) || + (strcasecmp(b, "localhost") == 0)) + ctld = 1; + } + if ((c = conf.control_machine)) { + actld = 1; + if ((strcmp(c, me) == 0) || + (strcasecmp(c, "localhost") == 0)) + ctld = 1; + } + if ((n = get_conf_node_name(me))) { + d = 1; + xfree(n); } - fclose (slurm_spec_file); + free_slurm_conf(&conf); strcpy(daemon_list, ""); - if (any_slurmctld && have_slurmctld) + if (actld && ctld) strcat(daemon_list, "slurmctld "); - if (any_slurmctld && have_slurmd) + if (actld && d) strcat(daemon_list, "slurmd"); fprintf (stdout, "%s\n", daemon_list) ; } -/* _parse_conf_line - determine if slurmctld or slurmd location identified */ -static void _parse_conf_line (char *in_line, bool *any_slurmctld, - bool *have_slurmctld, bool *have_slurmd) -{ - int error_code; - char *backup_controller = NULL, *control_machine = NULL; - char *node_name = NULL, *node_addr = NULL; - static char *this_host = NULL; - - error_code = slurm_parser (in_line, - "BackupController=", 's', &backup_controller, - "ControlMachine=", 's', &control_machine, - "NodeAddr=", 's', &node_addr, - "NodeName=", 's', &node_name, - "END"); - if (error_code) { - exit_code = 1; - if (quiet_flag == -1) - fprintf(stderr, "Can't parse %s of %s\n", - in_line, SLURM_CONFIG_FILE); - return; - } - - if (this_host == NULL) { - this_host = xmalloc(MAX_NAME_LEN); - getnodename(this_host, MAX_NAME_LEN); - } - - if (backup_controller) { - if ((strcmp(backup_controller, this_host) == 0) || - (strcasecmp(backup_controller, "localhost") == 0)) - *have_slurmctld = true; - xfree(backup_controller); - } - if (control_machine) { - *any_slurmctld = true; - if ((strcmp(control_machine, this_host) == 0) || - (strcasecmp(control_machine, "localhost") == 0)) - *have_slurmctld = true; - xfree(control_machine); - } - if (node_name) { - char *node_entry; - hostlist_t node_list = hostlist_create(node_name); - while ((*have_slurmd == false) && - (node_entry = hostlist_shift(node_list)) ) { - if ((strcmp(node_entry, this_host) == 0) || - (strcmp(node_entry, "localhost") == 0)) - *have_slurmd = true; - free(node_entry); - } - hostlist_destroy(node_list); - xfree(node_name); - } -#if HAVE_FRONT_END - if (node_addr) { - if ((strcmp(node_addr, this_host) == 0) - || (strcasecmp(node_addr, "localhost") == 0)) - *have_slurmd = true; - } -#endif - xfree(node_addr); -} - /* * _print_job - print the specified job's information * IN job_id - job's id or NULL to print information about all jobs diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 59414c9b467f4cd2f7f880765621e17481d49331..5a13f71b61b0cc2c822e6aeac481e27b2b3db175 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -609,6 +609,7 @@ void _dump_job_details(struct job_details *detail_ptr, Buf buffer) pack32((uint32_t) detail_ptr->max_nodes, buffer); pack32((uint32_t) detail_ptr->total_procs, buffer); + pack16((uint16_t) detail_ptr->req_tasks, buffer); pack16((uint16_t) detail_ptr->shared, buffer); pack16((uint16_t) 
detail_ptr->contiguous, buffer); @@ -636,7 +637,7 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer) char *err = NULL, *in = NULL, *out = NULL, *work_dir = NULL; char **argv = (char **) NULL; uint32_t min_nodes, max_nodes, min_procs; - uint16_t argc = 0, shared, contiguous, name_len; + uint16_t argc = 0, req_tasks, shared, contiguous, name_len; uint32_t min_memory, min_tmp_disk, total_procs; time_t submit_time; int i; @@ -646,6 +647,7 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer) safe_unpack32(&max_nodes, buffer); safe_unpack32(&total_procs, buffer); + safe_unpack16(&req_tasks, buffer); safe_unpack16(&shared, buffer); safe_unpack16(&contiguous, buffer); @@ -689,6 +691,7 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer) job_ptr->details->min_nodes = min_nodes; job_ptr->details->max_nodes = max_nodes; job_ptr->details->total_procs = total_procs; + job_ptr->details->req_tasks = req_tasks; job_ptr->details->shared = shared; job_ptr->details->contiguous = contiguous; job_ptr->details->min_procs = min_procs; diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index c1527c9a09a074badf2df5ff7f56dd258e331b5f..c1c772938519d2528d8f967716a62e894ccfea71 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -277,6 +277,7 @@ static void _launch_job(struct job_record *job_ptr) xmalloc(sizeof(batch_job_launch_msg_t)); launch_msg_ptr->job_id = job_ptr->job_id; launch_msg_ptr->uid = job_ptr->user_id; + launch_msg_ptr->nprocs = job_ptr->details->req_tasks; launch_msg_ptr->nodes = xstrdup(job_ptr->nodes); launch_msg_ptr->err = xstrdup(job_ptr->details->err); launch_msg_ptr->in = xstrdup(job_ptr->details->in); diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 2874cac6b8d749fe566212d5b3ba45a941aadd14..02dc5c063af7c3fb6a087bdb85cf77afc84681cd 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -517,6 +517,7 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, /* determine if job could possibly run (if all configured * nodes available) */ + if ((!runable_ever || !runable_avail) && (total_nodes >= min_nodes) && ((slurmctld_conf.fast_schedule == 0) || diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 01c11856883d2d18a2c42e24b8abe978091f477a..63970823b5de7c9ee1c5c5b15fe14363b2bd16df 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -260,9 +260,11 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) job_credential_private_key); conf_ptr->job_credential_public_certificate = xstrdup(slurmctld_conf. job_credential_public_certificate); + conf_ptr->kill_tree = slurmctld_conf.kill_tree; conf_ptr->kill_wait = slurmctld_conf.kill_wait; conf_ptr->max_job_cnt = slurmctld_conf.max_job_cnt; conf_ptr->min_job_age = slurmctld_conf.min_job_age; + conf_ptr->mpich_gm_dir = slurmctld_conf.mpich_gm_dir; conf_ptr->plugindir = xstrdup(slurmctld_conf.plugindir); conf_ptr->prolog = xstrdup(slurmctld_conf.prolog); conf_ptr->ret2service = slurmctld_conf.ret2service; @@ -470,7 +472,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg) slurm_send_rc_msg(msg, ESLURM_USER_ID_MISSING); return; } -#ifdef HAVE_FRONT_END +#ifdef HAVE_FRONT_END /* Limited job step support */ /* Non-super users not permitted to run job steps on front-end. * A single slurmd can not handle a heavy load. 
*/ if (!_is_super_user(uid)) { @@ -959,7 +961,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg) return; } -#ifdef HAVE_FRONT_END +#ifdef HAVE_FRONT_END /* Limited job step support */ /* Non-super users not permitted to run job steps on front-end. * A single slurmd can not handle a heavy load. */ if (!_is_super_user(uid)) { diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 97f583c79644cb1549304faf07162e78925f7cba..e1bb69765fa5ef7f7449a790ac5c078ae55e3b9b 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -57,7 +57,8 @@ #include "src/slurmctld/sched_plugin.h" #include "src/slurmctld/slurmctld.h" -#define BUF_SIZE 1024 +#define BUF_SIZE 1024 +#define MAX_NAME_LEN 32 static int _build_bitmaps(void); static int _init_all_slurm_conf(void); @@ -303,7 +304,7 @@ static int _init_all_slurm_conf(void) static int _parse_node_spec(char *in_line) { char *node_addr = NULL, *node_name = NULL, *state = NULL; - char *feature = NULL, *reason = NULL; + char *feature = NULL, *reason = NULL, *node_hostname = NULL; int error_code, first, i; int state_val, cpus_val, real_memory_val, tmp_disk_val, weight_val; struct node_record *node_ptr; @@ -321,10 +322,16 @@ static int _parse_node_spec(char *in_line) return error_code; if (node_name == NULL) return 0; /* no node info */ + if (strcasecmp(node_name, "localhost") == 0) { + xfree(node_name); + node_name = xmalloc(MAX_NAME_LEN); + getnodename(node_name, MAX_NAME_LEN); + } error_code = slurm_parser(in_line, "Feature=", 's', &feature, "NodeAddr=", 's', &node_addr, + "NodeHostname=", 's', &node_hostname, "Procs=", 'd', &cpus_val, "RealMemory=", 'd', &real_memory_val, "Reason=", 's', &reason, @@ -355,7 +362,7 @@ static int _parse_node_spec(char *in_line) xfree(state); } -#ifndef HAVE_FRONT_END /* Fake node addresses for front-end */ +#ifndef HAVE_FRONT_END /* Support NodeAddr expression */ if (node_addr && ((addr_list = hostlist_create(node_addr)) == NULL)) { error("hostlist_create error for %s: %m", node_addr); @@ -372,13 +379,6 @@ static int _parse_node_spec(char *in_line) first = 1; while ((this_node_name = hostlist_shift(host_list))) { - if (strcmp(this_node_name, "localhost") == 0) { - free(this_node_name); - this_node_name = malloc(128); - if (this_node_name == NULL) - fatal ("memory allocation failure"); - getnodename(this_node_name, 128); - } if (strcasecmp(this_node_name, "DEFAULT") == 0) { xfree(node_name); if (cpus_val != NO_VAL) @@ -437,10 +437,13 @@ static int _parse_node_spec(char *in_line) (state_val != NODE_STATE_UNKNOWN)) node_ptr->node_state = state_val; node_ptr->last_response = (time_t) 0; -#ifdef HAVE_FRONT_END /* Fake node addresses for front-end */ +#ifdef HAVE_FRONT_END /* Permit NodeAddr value reuse for front-end */ if (node_addr) strncpy(node_ptr->comm_name, node_addr, MAX_NAME_LEN); + else if (node_hostname) + strncpy(node_ptr->comm_name, + node_hostname, MAX_NAME_LEN); else strncpy(node_ptr->comm_name, node_ptr->name, MAX_NAME_LEN); @@ -484,6 +487,7 @@ static int _parse_node_spec(char *in_line) cleanup: xfree(node_addr); xfree(node_name); + xfree(node_hostname); xfree(feature); xfree(reason); xfree(state); @@ -708,12 +712,10 @@ static int _parse_part_spec(char *in_line) allow_groups = NULL; } if (nodes) { - if (strcmp(nodes, "localhost") == 0) { + if (strcasecmp(nodes, "localhost") == 0) { xfree(nodes); - nodes = xmalloc(128); - if (nodes == NULL) - fatal ("memory allocation failure"); - getnodename(nodes, 128); + nodes = xmalloc(MAX_NAME_LEN); + getnodename(nodes, 
MAX_NAME_LEN); } if (part_ptr->nodes) { xstrcat(part_ptr->nodes, ","); diff --git a/src/slurmd/Makefile.am b/src/slurmd/Makefile.am index f10755ce8100eed3a53dd5325caaebadc13babfe..3e8c1323a201283d7e16456d6d0cf3762f88a9a2 100644 --- a/src/slurmd/Makefile.am +++ b/src/slurmd/Makefile.am @@ -29,6 +29,7 @@ slurmd_SOURCES = \ shm.c shm.h \ fname.c fname.h \ ulimits.c ulimits.h \ + kill_tree.c kill_tree.h \ setproctitle.c setproctitle.h slurmd_LDFLAGS = -export-dynamic $(CMD_LDFLAGS) diff --git a/src/slurmd/kill_tree.c b/src/slurmd/kill_tree.c new file mode 100644 index 0000000000000000000000000000000000000000..986c28738b2c25ba33903c2f182b062b0be8909b --- /dev/null +++ b/src/slurmd/kill_tree.c @@ -0,0 +1,223 @@ +/*****************************************************************************\ + * src/slurmd/kill_tree.c - Kill process tree based upon process IDs + * Used primarily for MPICH-GM + ***************************************************************************** + * Copyright (C) 2004 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Takao Hatazaki <takao.hatazaki@hp.com> + * UCRL-CODE-2002-040. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+\*****************************************************************************/ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include <sys/types.h> +#include <dirent.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdlib.h> +#include <stdio.h> +#include <signal.h> +#include <strings.h> +#include <unistd.h> + +#include "src/common/xmalloc.h" +#include "src/common/log.h" + +typedef struct xpid_s { + pid_t pid; + struct xpid_s *next; +} xpid_t; + +typedef struct xppid_s { + pid_t ppid; + xpid_t *list; + struct xppid_s *next; +} xppid_t; + +#define HASH_LEN 64 +static xppid_t **hashtbl; + +#define GET_HASH_IDX(ppid) ((ppid)%HASH_LEN) + +static xpid_t *_alloc_pid(pid_t pid, xpid_t *next) +{ + xpid_t *new; + + new = (xpid_t *)xmalloc(sizeof(*new)); + new->pid = pid; + new->next = next; + return new; +} + +static xppid_t *_alloc_ppid(pid_t ppid, pid_t pid, xppid_t *next) +{ + xppid_t *new; + + new = xmalloc(sizeof(*new)); + new->ppid = ppid; + new->list = _alloc_pid(pid, NULL); + new->next = next; + return new; +} + +static void _push_to_hashtbl(pid_t ppid, pid_t pid) +{ + int idx; + xppid_t *ppids, *newppid; + xpid_t *newpid; + + idx = GET_HASH_IDX(ppid); + ppids = hashtbl[idx]; + while (ppids) { + if (ppids->ppid == ppid) { + newpid = _alloc_pid(pid, ppids->list); + ppids->list = newpid; + return; + } + ppids = ppids->next; + } + newppid = _alloc_ppid(ppid, pid, hashtbl[idx]); + hashtbl[idx] = newppid; +} + +static void _build_hashtbl() +{ + DIR *dir; + struct dirent *de; + char path[NAME_MAX], *endptr, *num, rbuf[1024]; + int fd; + long pid, ppid; + + if ((dir = opendir("/proc")) == NULL) { + error("opendir(/proc): %m"); + return; + } + + hashtbl = (xppid_t **)xmalloc(HASH_LEN * sizeof(xppid_t *)); + + while ((de = readdir(dir)) != NULL) { + num = de->d_name; + strtol(num, &endptr, 10); + if (endptr == NULL || *endptr != 0) + continue; + sprintf(path, "/proc/%s/stat", num); + if ((fd = open(path, O_RDONLY)) < 0) { + continue; + } + if (read(fd, rbuf, 1024) <= 0) { + close(fd); + continue; + } + if (sscanf(rbuf, "%ld %*s %*s %ld", &pid, &ppid) != 2) { + close(fd); + continue; + } + close(fd); + _push_to_hashtbl((pid_t)ppid, (pid_t)pid); + } + closedir(dir); +} + +static void _destroy_hashtbl() +{ + int i; + xppid_t *ppid, *tmp2; + xpid_t *list, *tmp; + + for (i=0; i<HASH_LEN; i++) { + ppid = hashtbl[i]; + while (ppid) { + list = ppid->list; + while (list) { + tmp = list->next; + xfree(list); + list = tmp; + } + tmp2 = ppid->next; + xfree(ppid); + ppid = tmp2; + } + } +} + + +static xpid_t *_get_list(int top, xpid_t *list) +{ + xppid_t *ppid; + xpid_t *children; + + ppid = hashtbl[GET_HASH_IDX(top)]; + while (ppid) { + if (ppid->ppid == top) { + children = ppid->list; + while (children) { + list = _alloc_pid(children->pid, list); + children = children->next; + } + children = ppid->list; + while (children) { + list = _get_list(children->pid, list); + children = children->next; + } + break; + } + ppid = ppid->next; + } + return list; +} + +static void _destroy_list(xpid_t *list) +{ + xpid_t *tmp; + + while (list) { + tmp = list->next; + xfree(list); + list = tmp; + } +} + +static void _kill_proclist(xpid_t *list, int sig) +{ + while (list) { + verbose("Sending %d to %d", sig, list->pid); + /* Do not check errors. May already be dead */ + kill(list->pid, sig); + list = list->next; + } +} + +extern void kill_proc_tree(pid_t top, int sig) +{ + /* + * Some of processes may not be in the same process group + * (e.g. GMPI processes). 
So, find out the process tree, + * then kill all that subtree. + */ + xpid_t *list; + + _build_hashtbl(); + list = _get_list(top, _alloc_pid(top, NULL)); + _kill_proclist(list, sig); + _destroy_hashtbl(); + _destroy_list(list); +} diff --git a/src/slurmd/kill_tree.h b/src/slurmd/kill_tree.h new file mode 100644 index 0000000000000000000000000000000000000000..95e41f7d41491603fabba020e117295653ef96e9 --- /dev/null +++ b/src/slurmd/kill_tree.h @@ -0,0 +1,39 @@ +/*****************************************************************************\ + * gmpi.h - srun support for MPICH-GM (GMPI) + ***************************************************************************** + * Copyright (C) 2004 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Takao Hatazaki <takao.hatazaki@hp.com> + * UCRL-CODE-2002-040. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ + +#ifndef _HAVE_KILL_TREE_H +#define _HAVE_KILL_TREE_H + +#include <sys/types.h> + +extern void kill_proc_tree(pid_t top, int sig); +/* + * Some of processes may not be in the same process group + * (e.g. GMPI processes). So, find out the process tree, + * then kill all that subtree. 
+ */ + +#endif /* _HAVE_KILL_TREE_H */ diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index bd15d6b184510e939d9cc02d682caf6959b129ce..f995bb94cbd1c3769919b55a70e31dab4c41c941 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -132,7 +132,7 @@ static void _set_unexited_task_status(slurmd_job_t *job, int status); static int _send_pending_exit_msgs(slurmd_job_t *job); static void _setargs(slurmd_job_t *job); -static void _set_launch_ip_in_env(slurmd_job_t *, slurm_addr *cli); +static void _set_mgr_env(slurmd_job_t *, slurm_addr *cli, slurm_addr *self); static void _random_sleep(slurmd_job_t *job); static char *_sprint_task_cnt(batch_job_launch_msg_t *msg); @@ -152,8 +152,9 @@ static void _hup_handler(int sig) {;} /* * Launch an job step on the current node */ -int -mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli) +extern int +mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli, + slurm_addr *self) { slurmd_job_t *job = NULL; @@ -166,7 +167,7 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli) _setargs(job); - _set_launch_ip_in_env(job, cli); + _set_mgr_env(job, cli, self); if (_job_mgr(job) < 0) return SLURM_ERROR; @@ -231,7 +232,8 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli) * Spawn a task / job step on the current node */ int -mgr_spawn_task(spawn_task_request_msg_t *msg, slurm_addr *cli) +mgr_spawn_task(spawn_task_request_msg_t *msg, slurm_addr *cli, + slurm_addr *self) { slurmd_job_t *job = NULL; @@ -243,7 +245,7 @@ mgr_spawn_task(spawn_task_request_msg_t *msg, slurm_addr *cli) _setargs(job); - _set_launch_ip_in_env(job, cli); + _set_mgr_env(job, cli, self); if (_job_mgr(job) < 0) return SLURM_ERROR; @@ -968,6 +970,7 @@ _setup_batch_env(slurmd_job_t *job, batch_job_launch_msg_t *msg) hostlist_ranged_string(hl, 1024, buf); setenvpf(&job->env, "SLURM_JOBID", "%u", job->jobid); + setenvpf(&job->env, "SLURM_NPROCS", "%u", msg->nprocs); setenvpf(&job->env, "SLURM_NNODES", "%u", hostlist_count(hl)); setenvpf(&job->env, "SLURM_NODELIST", "%s", buf); hostlist_destroy(hl); @@ -1020,7 +1023,7 @@ _send_launch_failure (launch_tasks_request_msg_t *msg, slurm_addr *cli, int rc) resp_msg.data = &resp; resp_msg.msg_type = RESPONSE_LAUNCH_TASKS; - resp.node_name = conf->hostname; + resp.node_name = conf->node_name; resp.srun_node_id = msg->srun_node_id; resp.return_code = rc ? 
rc : -1; resp.count_of_pids = 0; @@ -1047,7 +1050,7 @@ _send_launch_resp(slurmd_job_t *job, int rc) resp_msg.data = &resp; resp_msg.msg_type = RESPONSE_LAUNCH_TASKS; - resp.node_name = conf->hostname; + resp.node_name = conf->node_name; resp.srun_node_id = job->nodeid; resp.return_code = rc; resp.count_of_pids = job->ntasks; @@ -1073,7 +1076,7 @@ _complete_job(uint32_t jobid, int err, int status) req.job_step_id = NO_VAL; req.job_rc = status; req.slurm_rc = err; - req.node_name = conf->hostname; + req.node_name = conf->node_name; req_msg.msg_type= REQUEST_COMPLETE_JOB_STEP; req_msg.data = &req; @@ -1213,7 +1216,7 @@ _setargs(slurmd_job_t *job) } static void -_set_launch_ip_in_env(slurmd_job_t *job, slurm_addr *cli) +_set_mgr_env(slurmd_job_t *job, slurm_addr *cli, slurm_addr *self) { char *p; char addrbuf[INET_ADDRSTRLEN]; @@ -1229,6 +1232,14 @@ _set_launch_ip_in_env(slurmd_job_t *job, slurm_addr *cli) *p = '\0'; setenvpf (&job->env, "SLURM_LAUNCH_NODE_IPADDR", "%s", addrbuf); + + if (getenvp(job->env, "SLURM_GMPI")) { + setenvpf (&job->env, "GMPI_MASTER", "%s", addrbuf); + slurm_print_slurm_addr (self, addrbuf, INET_ADDRSTRLEN); + if ((p = strchr (addrbuf, ':')) != NULL) *p = '\0'; + setenvpf (&job->env, "GMPI_SLAVE", "%s", addrbuf); + } + return; } diff --git a/src/slurmd/mgr.h b/src/slurmd/mgr.h index db667ea244f24b4b9d3b8a193f4d381f5232f9a3..e012676b9d8659c814b80777370933b2bace4169 100644 --- a/src/slurmd/mgr.h +++ b/src/slurmd/mgr.h @@ -36,11 +36,13 @@ /* Spawn a task / job step on this node */ -int mgr_spawn_task(spawn_task_request_msg_t *msg, slurm_addr *client); +int mgr_spawn_task(spawn_task_request_msg_t *msg, slurm_addr *client, + slurm_addr *self); /* Launch a job step on this node */ -int mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *client); +int mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *client, + slurm_addr *self); /* * Launch batch script on this node diff --git a/src/slurmd/req.c b/src/slurmd/req.c index b84d107d0e6d91111566e6f4145ce04aba343915..de5564069cb6c7f66180a2ac4f61ad7d025cc11d 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -35,7 +35,6 @@ #include <sys/param.h> #include <unistd.h> #include <stdlib.h> - #include <sys/poll.h> #include <sys/wait.h> @@ -45,6 +44,7 @@ #include "src/common/slurm_auth.h" #include "src/common/slurm_cred.h" #include "src/common/slurm_protocol_api.h" +#include "src/common/slurm_protocol_interface.h" #include "src/common/xstring.h" #include "src/common/xmalloc.h" #include "src/common/list.h" @@ -53,6 +53,7 @@ #include "src/slurmd/slurmd.h" #include "src/slurmd/shm.h" #include "src/slurmd/mgr.h" +#include "src/slurmd/kill_tree.h" #ifndef MAXHOSTNAMELEN #define MAXHOSTNAMELEN 64 @@ -62,7 +63,8 @@ static bool _slurm_authorized_user(uid_t uid); static bool _job_still_running(uint32_t job_id); static int _kill_all_active_steps(uint32_t jobid, int sig, bool batch); -static int _launch_tasks(launch_tasks_request_msg_t *, slurm_addr *); +static int _launch_tasks(launch_tasks_request_msg_t *, slurm_addr *, + slurm_addr *); static void _rpc_launch_tasks(slurm_msg_t *, slurm_addr *); static void _rpc_spawn_task(slurm_msg_t *, slurm_addr *); static void _rpc_batch_job(slurm_msg_t *, slurm_addr *); @@ -77,7 +79,8 @@ static void _rpc_pid2jid(slurm_msg_t *msg, slurm_addr *); static int _rpc_ping(slurm_msg_t *, slurm_addr *); static int _run_prolog(uint32_t jobid, uid_t uid); static int _run_epilog(uint32_t jobid, uid_t uid); -static int _spawn_task(spawn_task_request_msg_t *, slurm_addr *); +static int 
_spawn_task(spawn_task_request_msg_t *, slurm_addr *, + slurm_addr *); static bool _pause_for_job_completion (uint32_t jobid, int maxtime); static int _waiter_init (uint32_t jobid); @@ -267,23 +270,24 @@ _launch_batch_job(batch_job_launch_msg_t *req, slurm_addr *cli) } static int -_launch_tasks(launch_tasks_request_msg_t *req, slurm_addr *cli) +_launch_tasks(launch_tasks_request_msg_t *req, slurm_addr *cli, + slurm_addr *self) { int retval; if ((retval = _fork_new_slurmd()) == 0) - exit (mgr_launch_tasks(req, cli)); + exit (mgr_launch_tasks(req, cli, self)); return (retval <= 0) ? retval : 0; } static int -_spawn_task(spawn_task_request_msg_t *req, slurm_addr *cli) +_spawn_task(spawn_task_request_msg_t *req, slurm_addr *cli, slurm_addr *self) { int retval; if ((retval = _fork_new_slurmd()) == 0) - exit (mgr_spawn_task(req, cli)); + exit (mgr_spawn_task(req, cli, self)); return (retval <= 0) ? retval : 0; } @@ -331,7 +335,7 @@ _check_job_credential(slurm_cred_t cred, uint32_t jobid, goto fail; } - if (!hostset_within(hset, conf->hostname)) { + if (!hostset_within(hset, conf->node_name)) { error("job credential invald for this host [%d.%d %ld %s]", arg.jobid, arg.stepid, (long) arg.uid, arg.hostlist); goto fail; @@ -360,6 +364,8 @@ _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli) uint32_t jobid = req->job_id; uint32_t stepid = req->job_step_id; bool super_user = false, run_prolog = false; + slurm_addr self; + socklen_t adlen; req_uid = g_slurm_auth_get_uid(msg->cred); @@ -395,7 +401,9 @@ _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli) goto done; } - if (_launch_tasks(req, cli) < 0) + adlen = sizeof(self); + _slurm_getsockname(msg->conn_fd, (struct sockaddr *)&self, &adlen); + if (_launch_tasks(req, cli, &self) < 0) errnum = errno; done: @@ -430,6 +438,8 @@ _rpc_spawn_task(slurm_msg_t *msg, slurm_addr *cli) uint32_t jobid = req->job_id; uint32_t stepid = req->job_step_id; bool super_user = false, run_prolog = false; + slurm_addr self; + socklen_t adlen; req_uid = g_slurm_auth_get_uid(msg->cred); @@ -465,7 +475,9 @@ _rpc_spawn_task(slurm_msg_t *msg, slurm_addr *cli) goto done; } - if (_spawn_task(req, cli) < 0) + adlen = sizeof(self); + _slurm_getsockname(msg->conn_fd, (struct sockaddr *)&self, &adlen); + if (_spawn_task(req, cli, &self) < 0) errnum = errno; done: @@ -617,16 +629,20 @@ _rpc_kill_tasks(slurm_msg_t *msg, slurm_addr *cli_addr) goto done; } - if (kill(-step->sid, req->signal) < 0) - rc = errno; - - if (rc == SLURM_SUCCESS) - verbose("Sent signal %d to %u.%u", - req->signal, req->job_id, req->job_step_id); - else - verbose("Error sending signal %d to %u.%u: %s", - req->signal, req->job_id, req->job_step_id, - slurm_strerror(rc)); + if (conf->cf.kill_tree) { + kill_proc_tree(step->sid, req->signal); + rc = SLURM_SUCCESS; + } else { + if (kill(-step->sid, req->signal) < 0) + rc = errno; + if (rc == SLURM_SUCCESS) + verbose("Sent signal %d to %u.%u", + req->signal, req->job_id, req->job_step_id); + else + verbose("Error sending signal %d to %u.%u: %s", + req->signal, req->job_id, req->job_step_id, + slurm_strerror(rc)); + } done: if (step) @@ -835,7 +851,7 @@ _rpc_reattach_tasks(slurm_msg_t *msg, slurm_addr *cli) debug2("update step addrs rc = %d", rc); resp_msg.data = &resp; resp_msg.msg_type = RESPONSE_REATTACH_TASKS; - resp.node_name = conf->hostname; + resp.node_name = conf->node_name; resp.srun_node_id = req->srun_node_id; resp.return_code = rc; @@ -862,8 +878,11 @@ _kill_all_active_steps(uint32_t jobid, int sig, bool batch) int step_cnt = 0; while ((s = list_next(i))) { 
diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c
index fb87a59b2c8991200a2d9a50e3c718f097c53dd9..df65666e114fbaf15a7f4fff97cf0f7dee8d5545 100644
--- a/src/slurmd/slurmd.c
+++ b/src/slurmd/slurmd.c
@@ -388,7 +388,7 @@ _fill_registration_msg(slurm_node_registration_status_msg_t *msg)
         job_step_t  *s;
         int          n;
 
-        msg->node_name = xstrdup (conf->hostname);
+        msg->node_name = xstrdup (conf->node_name);
         get_procs(&msg->cpus);
         get_memory(&msg->real_memory_size);
@@ -475,6 +475,7 @@ _read_config()
         if (!conf->logfile)
                 conf->logfile = xstrdup(conf->cf.slurmd_logfile);
 
+        _free_and_set(&conf->node_name, get_conf_node_name(conf->hostname));
         _free_and_set(&conf->epilog, xstrdup(conf->cf.epilog));
         _free_and_set(&conf->prolog, xstrdup(conf->cf.prolog));
         _free_and_set(&conf->tmpfs,  xstrdup(conf->cf.tmp_fs));
@@ -542,6 +543,7 @@ _init_conf()
                 exit(1);
         }
         conf->hostname  = xstrdup(host);
+        conf->node_name = NULL;
         conf->conffile  = NULL;
         conf->epilog    = NULL;
         conf->logfile   = NULL;
@@ -938,11 +940,21 @@ static void _install_fork_handlers(void)
         return;
 }
 
-void slurmd_get_addr(slurm_addr *a, uint16_t *port, char *buf, uint32_t len)
+extern void
+slurmd_get_addr(slurm_addr *a, uint16_t *port, char *buf, uint32_t len)
 {
+#if 0
         slurm_mutex_lock(&fork_mutex);
         slurm_get_addr(a, port, buf, len);
         slurm_mutex_unlock(&fork_mutex);
+#else
+        /* This function is used only for printing debug information.
+         * Do not consult /etc/hosts or, more significantly, YP (NIS). */
+        unsigned char *uc = (unsigned char *)&a->sin_addr.s_addr;
+        xassert(len > 15);
+        *port = a->sin_port;
+        sprintf(buf, "%u.%u.%u.%u", uc[0], uc[1], uc[2], uc[3]);
+#endif
        return;
 }
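The get_conf_node_name() call in _read_config() belongs to the common
configuration code (src/common/read_config.c, not shown in this excerpt).
Conceptually it is a reverse lookup over the NodeName/NodeHostname pairs
parsed from slurm.conf; a minimal sketch of that idea, using an
illustrative table type rather than the real parser structures:

#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for the parsed node table. */
struct node_entry {
        const char *node_name;          /* NodeName= value     */
        const char *node_hostname;      /* NodeHostname= value */
};

/* Return the NodeName whose NodeHostname matches the local hostname,
 * falling back to the hostname itself when no mapping exists. */
static char *lookup_node_name(const struct node_entry *tab, size_t n,
                              const char *hostname)
{
        size_t i;

        for (i = 0; i < n; i++) {
                if (strcmp(tab[i].node_hostname, hostname) == 0)
                        return strdup(tab[i].node_name);
        }
        return strdup(hostname);
}

With a hypothetical entry of the form "NodeName=node000 NodeHostname=fe0",
a slurmd whose hostname is "fe0" then registers, answers reattach
requests, and validates job credentials as "node000".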
diff --git a/src/slurmd/slurmd.h b/src/slurmd/slurmd.h
index 285ece9f4f6ad78b2333f3b60dc8cfb531fa3a95..7df43005fab0d71d64578407f4b23192b8414eff 100644
--- a/src/slurmd/slurmd.h
+++ b/src/slurmd/slurmd.h
@@ -64,6 +64,7 @@ typedef struct slurmd_config {
         char ***argv;           /* pointer to argument vector   */
         int    *argc;           /* pointer to argument count    */
         char   *hostname;       /* local hostname               */
+        char   *node_name;      /* NodeName from slurm.conf     */
         char   *conffile;       /* config filename              */
         char   *logfile;        /* slurmd logfile, if any       */
         char   *spooldir;       /* SlurmdSpoolDir               */
diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c
index dfb2ea41499a8e80d4211139e89cd5e60eef5955..a9ed0475cedb5b35bbb4480578e33c0c5f22c076 100644
--- a/src/slurmd/smgr.c
+++ b/src/slurmd/smgr.c
@@ -560,6 +560,10 @@ _setup_env(slurmd_job_t *job, int taskid)
         if (setenvpf(&job->env, "SLURM_PROCID", "%d", t->gid ) < 0)
                 return -1;
 
+        if (getenvp(job->env, "SLURM_GMPI")) {
+                if (setenvpf(&job->env, "GMPI_ID", "%d", t->gid) < 0)
+                        return -1;
+        }
         return SLURM_SUCCESS;
 }
 
diff --git a/src/srun/Makefile.am b/src/srun/Makefile.am
index cdac2fffb3d4d896b3afb1522965616f557311c5..93d85323c1c26f076a373cd928f9ab94baf3b0ac 100644
--- a/src/srun/Makefile.am
+++ b/src/srun/Makefile.am
@@ -12,6 +12,7 @@ srun_SOURCES = \
         opt.c opt.h \
         env.c env.h \
         job.c job.h \
+        gmpi.c gmpi.h \
         net.c net.h \
         msg.c msg.h \
         io.c io.h \
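The gmpi.c file added below implements only the master side of the
MPICH-GM startup handshake; the slave side lives inside the MPICH-GM
library linked into each task. The contract between the two is carried
entirely by environment variables, exported by the srun.c hunk at the end
of this patch and completed per task by the smgr.c hunk above. A
stand-alone sketch of what a task can expect to find, with placeholder
behavior in place of MPICH-GM's real bootstrap:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        const char *port  = getenv("GMPI_PORT");   /* master's TCP port   */
        const char *magic = getenv("GMPI_MAGIC");  /* SLURM job id        */
        const char *np    = getenv("GMPI_NP");     /* total task count    */
        const char *id    = getenv("GMPI_ID");     /* rank, set by slurmd */

        if (!port || !magic || !np || !id) {
                fprintf(stderr, "not launched by a GMPI-aware srun\n");
                return 1;
        }
        /* A real slave would now open a GM port and send its init
         * message back to the master at GMPI_PORT. */
        printf("task %s of %s: master port %s, magic %s\n",
               id, np, port, magic);
        return 0;
}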
diff --git a/src/srun/gmpi.c b/src/srun/gmpi.c
new file mode 100644
index 0000000000000000000000000000000000000000..8c81987861a7de3736b63604be7b64cf12d0c1e3
--- /dev/null
+++ b/src/srun/gmpi.c
@@ -0,0 +1,329 @@
+/*****************************************************************************\
+ *  gmpi.c - srun support for MPICH-GM (GMPI)
+ *****************************************************************************
+ *  Copyright (C) 2004 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Takao Hatazaki <takao.hatazaki@hp.com>
+ *  UCRL-CODE-2002-040.
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307  USA.
+\*****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#ifdef WITH_PTHREADS
+#  include <pthread.h>
+#endif
+
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include "src/common/xmalloc.h"
+#include "src/common/xstring.h"
+
+#include "src/srun/allocate.h"
+#include "src/srun/env.h"
+#include "src/srun/io.h"
+#include "src/srun/job.h"
+#include "src/srun/gmpi.h"
+#include "src/srun/launch.h"
+#include "src/srun/msg.h"
+#include "src/srun/net.h"
+#include "src/srun/opt.h"
+#include "src/srun/signals.h"
+#include "src/srun/sigstr.h"
+#include "src/srun/reattach.h"
+#include "src/srun/attach.h"
+
+
+static int _gmpi_parse_init_recv_msg(job_t *job, char *rbuf,
+                                     gm_slave_t *slave_data);
+
+
+static int _gmpi_parse_init_recv_msg(job_t *job, char *rbuf,
+                                     gm_slave_t *slave_data)
+{
+        unsigned int magic, id, port_board_id, unique_high_id,
+                unique_low_id, numanode, remote_pid, remote_port;
+        int got;
+        gm_slave_t *dp;
+
+        got = sscanf(rbuf, "<<<%u:%u:%u:%u:%u:%u:%u::%u>>>",
+                     &magic, &id, &port_board_id, &unique_high_id,
+                     &unique_low_id, &numanode, &remote_pid, &remote_port);
+        if (got != 8) {
+                error("GMPI master received invalid init message");
+                return -1;
+        }
+        if (magic != job->jobid) {
+                error("GMPI master received invalid magic number");
+                return -1;
+        }
+        if (id >= opt.nprocs)
+                fatal("GMPI id is out of range");
+        if (port_board_id == 0)
+                fatal("MPI id=%d was unable to open a GM port", id);
+
+        dp = &slave_data[id];
+        if (dp->defined) {
+                error("Ignoring the message from MPI id=%d", id);
+                return -1;
+        }
+        dp->defined = 1;
+        dp->port_board_id = port_board_id;
+        dp->unique_high_id = unique_high_id;
+        dp->unique_low_id = unique_low_id;
+        dp->numanode = numanode;
+        dp->remote_pid = remote_pid;
+        dp->remote_port = remote_port;
+
+        debug3("slave_data[%d]: <<<%u:%u:%u:%u:%u:%u:%u::%u>>>",
+               id, magic, id, port_board_id,
+               dp->unique_high_id, dp->unique_low_id, dp->numanode,
+               dp->remote_pid, dp->remote_port);
+        return 0;
+}
+
+
+static int _gmpi_establish_map(job_t *job)
+{
+        struct sockaddr_in addr;
+        socklen_t addrlen;
+        int accfd, newfd, rlen, nprocs, i, j;
+        size_t gmaplen, lmaplen, maplen;
+        char *p, *rbuf = NULL, *gmap = NULL, *lmap = NULL, *map = NULL;
+        char tmp[128];
+        gm_slave_t *slave_data = NULL, *dp;
+
+        /*
+         * Collect info from slaves.
+         * Will never finish unless slaves are GMPI processes.
+         */
+        accfd = job->gmpi_fd;
+        addrlen = sizeof(addr);
+        nprocs = opt.nprocs;
+        slave_data = (gm_slave_t *)xmalloc(sizeof(*slave_data)*nprocs);
+        for (i=0; i<nprocs; i++)
+                slave_data[i].defined = 0;
+        i = 0;
+        rbuf = (char *)xmalloc(GMPI_RECV_BUF_LEN);
+        while (i < nprocs) {
+                newfd = accept(accfd, (struct sockaddr *)&addr, &addrlen);
+                if (newfd == -1) {
+                        error("accept(2) in GMPI master thread: %m");
+                        continue;
+                }
+                rlen = recv(newfd, rbuf, GMPI_RECV_BUF_LEN - 1, 0);
+                if (rlen <= 0) {
+                        error("GMPI master recv returned %d", rlen);
+                        close(newfd);
+                        continue;
+                } else {
+                        rbuf[rlen] = 0;
+                }
+                if (_gmpi_parse_init_recv_msg(job, rbuf, slave_data) == 0)
+                        i++;
+                close(newfd);
+        }
+        xfree(rbuf);
+        debug2("Received data from all of %d GMPI processes.", i);
+
+        /*
+         * Compose the global map string.
+         */
+        gmap = (char *)xmalloc(128*nprocs);
+        p = gmap;
+        strcpy(p, "[[[");
+        p += 3;
+        for (i=0; i<nprocs; i++) {
+                dp = &slave_data[i];
+                sprintf(tmp, "<%u:%u:%u:%u>", dp->port_board_id,
+                        dp->unique_high_id, dp->unique_low_id, dp->numanode);
+                strcpy(p, tmp);
+                p += strlen(tmp);
+        }
+        strcpy(p, "|||");
+        p += 3;
+        gmaplen = (size_t)(p - gmap);
+
+        /*
+         * Respond to slaves.
+         */
+        lmap = (char *)xmalloc(128*nprocs);
+        for (i=0; i<nprocs; i++) {
+                /*
+                 * Compose the string to send.
+                 */
+                dp = &slave_data[i];
+                p = lmap;
+                for (j=0; j<nprocs; j++) {
+                        if (job->hostid[i] == job->hostid[j] &&
+                            dp->numanode == slave_data[j].numanode) {
+                                sprintf(tmp, "<%u>", j);
+                                strcpy(p, tmp);
+                                p += strlen(tmp);
+                        }
+                }
+                lmaplen = (size_t)(p - lmap);
+                map = (char *)xmalloc(gmaplen+lmaplen+4);
+                strcpy(map, gmap);
+                strcpy(map+gmaplen, lmap);
+                strcpy(map+gmaplen+lmaplen, "]]]");
+                maplen = gmaplen + lmaplen + 3;
+
+                /*
+                 * Send it.
+                 */
+                if ((newfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
+                        fatal("GMPI master failed to respond");
+                }
+                j = 1;
+                if (setsockopt(newfd, SOL_SOCKET, SO_REUSEADDR,
+                               (void *)&j, sizeof(j)))
+                        error("setsockopt in GMPI master: %m");
+                bzero(&addr, sizeof(addr));
+                addr.sin_family = AF_INET;
+                addr.sin_addr.s_addr
+                        = job->slurmd_addr[job->hostid[i]].sin_addr.s_addr;
+                addr.sin_port = htons(dp->remote_port);
+                if (connect(newfd, (struct sockaddr *)&addr, sizeof(addr)))
+                        fatal("GMPI master failed to connect");
+                send(newfd, map, maplen, 0);
+                close(newfd);
+                xfree(map);
+        }
+        xfree(slave_data);
+        xfree(lmap);
+        xfree(gmap);
+
+        debug2("GMPI master responded to all GMPI processes");
+        return 0;
+}
+
+
+static void _gmpi_wait_abort(job_t *job)
+{
+        struct sockaddr_in addr;
+        socklen_t addrlen;
+        int newfd, rlen;
+        unsigned int magic;
+        char *rbuf;
+
+        rbuf = (char *)xmalloc(GMPI_RECV_BUF_LEN);
+        addrlen = sizeof(addr);
+        while (1) {
+                newfd = accept(job->gmpi_fd, (struct sockaddr *)&addr,
+                               &addrlen);
+                if (newfd == -1) {
+                        fatal("GMPI master failed to accept (abort-wait)");
+                }
+                rlen = recv(newfd, rbuf, GMPI_RECV_BUF_LEN - 1, 0);
+                if (rlen <= 0) {
+                        error("GMPI recv (abort-wait) returned %d", rlen);
+                        close(newfd);
+                        continue;
+                } else {
+                        rbuf[rlen] = 0;
+                }
+                if (sscanf(rbuf, "<<<ABORT_%u_ABORT>>>", &magic) != 1) {
+                        error("GMPI (abort-wait) received spurious message.");
+                        close(newfd);
+                        continue;
+                }
+                if (magic != job->jobid) {
+                        error("GMPI (abort-wait) received bad magic number.");
+                        close(newfd);
+                        continue;
+                }
+                close(newfd);
+                debug("Received ABORT message from an MPI process.");
+                fwd_signal(job, SIGKILL);
+#if 0
+                xfree(rbuf);
+                close(job->gmpi_fd);
+                job->gmpi_fd = -1;
+                return;
+#endif
+        }
+}
+
+
+static void *_gmpi_thr(void *arg)
+{
+        job_t *job;
+
+        job = (job_t *) arg;
+
+        debug3("GMPI master thread pid=%lu", (unsigned long) getpid());
+        _gmpi_establish_map(job);
+
+        debug3("GMPI master thread is waiting for ABORT message.");
+        _gmpi_wait_abort(job);
+
+        return (void *)0;
+}
+
+
+extern int gmpi_thr_create(job_t *job, char **port)
+{
+        int fd;
+        struct sockaddr_in addr;
+        char name[128];
+        socklen_t namelen;
+        pthread_attr_t attr;
+
+        /*
+         * Prepare for accepting GMPI processes.
+         */
+        if ((fd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
+                return -1;
+        }
+        bzero(&addr, sizeof(addr));
+        addr.sin_family = AF_INET;
+        addr.sin_addr.s_addr = htonl(INADDR_ANY);
+        addr.sin_port = htons(0);
+        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+                return -1;
+        }
+        if (listen(fd, 5) == -1)
+                return -1;
+
+        /*
+         * Get the port name to communicate.
+         */
+        namelen = sizeof(addr);
+        getsockname(fd, (struct sockaddr *)&addr, &namelen);
+        sprintf(name, "%u", ntohs(addr.sin_port));
+        *port = xstrdup(name);
+
+        /*
+         * Accept in a separate thread.
+         */
+        slurm_attr_init(&attr);
+        pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+        job->gmpi_fd = fd;
+        if (pthread_create(&job->gtid, &attr, &_gmpi_thr, (void *)job))
+                return -1;
+        debug("Started GMPI master thread (%lu)", (unsigned long) job->gtid);
+
+        return 0;
+}
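Because _gmpi_establish_map() blocks in accept() until all opt.nprocs
slaves have checked in, a hypothetical throw-away client such as the
following can exercise the parser by hand. The message layout mirrors the
sscanf() format in _gmpi_parse_init_recv_msg(); the port, magic, and field
values are placeholders only.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in addr;
        char buf[256];
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        addr.sin_port = htons(7777);    /* the GMPI_PORT srun exported */
        if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0)
                return 1;
        /* <<<magic:id:port_board_id:uhi:uli:numanode:pid:recv_port>>>
         * magic must equal the SLURM job id or the master rejects it. */
        snprintf(buf, sizeof(buf), "<<<%u:%u:%u:%u:%u:%u:%u::%u>>>",
                 4242u, 0u, 1u, 0u, 0u, 0u, 1234u, 7778u);
        send(fd, buf, strlen(buf), 0);
        close(fd);
        return 0;
}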
diff --git a/src/srun/gmpi.h b/src/srun/gmpi.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bd754022763c00eb8df62beda25b243c7bca07a
--- /dev/null
+++ b/src/srun/gmpi.h
@@ -0,0 +1,47 @@
+/*****************************************************************************\
+ *  gmpi.h - srun support for MPICH-GM (GMPI)
+ *****************************************************************************
+ *  Copyright (C) 2004 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Takao Hatazaki <takao.hatazaki@hp.com>
+ *  UCRL-CODE-2002-040.
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307  USA.
+\*****************************************************************************/
+
+#ifndef _HAVE_GMPI_H
+#define _HAVE_GMPI_H
+
+#include "src/srun/job.h"
+
+typedef struct {
+        int defined;
+        unsigned int port_board_id;
+        unsigned int unique_high_id;
+        unsigned int unique_low_id;
+        unsigned int numanode;
+        unsigned int remote_pid;
+        unsigned int remote_port;
+} gm_slave_t;
+
+#define GMPI_RECV_BUF_LEN 65536
+
+extern int gmpi_thr_create(job_t *job, char **port);
+
+#endif /* _HAVE_GMPI_H */
+
diff --git a/src/srun/job.c b/src/srun/job.c
index 71c5a6cb2f6ac09a9cb574062df3818aa8f8d02f..fb628786b2e461caf88168713a8173ba055f29f0 100644
--- a/src/srun/job.c
+++ b/src/srun/job.c
@@ -39,6 +39,7 @@
 #include "src/common/cbuf.h"
 #include "src/common/hostlist.h"
 #include "src/common/log.h"
+#include "src/common/read_config.h"
 #include "src/common/slurm_protocol_api.h"
 #include "src/common/slurm_cred.h"
 #include "src/common/xmalloc.h"
@@ -234,9 +235,10 @@ job_create_noalloc(void)
         job = _job_create_internal(ai);
 
         for (i = 0; i < job->nhosts; i++) {
+                char *nd = get_conf_node_hostname(job->host[i]);
                 slurm_set_addr ( &job->slurmd_addr[i],
-                                 slurm_get_slurmd_port(),
-                                 job->host[i] );
+                                 slurm_get_slurmd_port(), nd );
+                xfree(nd);
         }
 
         _job_fake_cred(job);
@@ -445,8 +447,11 @@ _compute_task_count(allocation_info_t *info)
 
 static void
 _set_nprocs(allocation_info_t *info)
 {
-        if (!opt.nprocs_set)
+        if (!opt.nprocs_set) {
                 opt.nprocs = _compute_task_count(info);
+                if (opt.cpus_set)
+                        opt.nprocs_set = true;  /* implicit */
+        }
 }
 
@@ -476,7 +481,7 @@ _job_create_internal(allocation_info_t *info)
         job->nodelist = xstrdup(info->nodelist);
         hl = hostlist_create(job->nodelist);
 
-#ifdef HAVE_FRONT_END
+#ifdef HAVE_FRONT_END  /* Limited job step support */
         /* All jobs execute through front-end on Blue Gene/L.
          * Normally we would not permit execution of job steps,
          * but can fake it by just allocating all tasks to
diff --git a/src/srun/job.h b/src/srun/job.h
index a790b68e8e8aacea5073cff44b6777fe44e7bad7..eeadb40a387cc49ccf45d7bc98f7b368cde446e2 100644
--- a/src/srun/job.h
+++ b/src/srun/job.h
@@ -96,6 +96,9 @@ typedef struct srun_job {
 
         slurm_addr *slurmd_addr;/* slurm_addr vector to slurmd's */
 
+        pthread_t gtid;         /* GMPI master thread    */
+        int gmpi_fd;            /* fd for accept(2)      */
+
         pthread_t sigid;        /* signals thread tid    */
 
         pthread_t jtid;         /* job control thread id */
diff --git a/src/srun/msg.c b/src/srun/msg.c
index b7b60f9fe29d1408f6100303ff216487ebff4dde..cd91062717595debd1ff830a3d2246e94b11bf1a 100644
--- a/src/srun/msg.c
+++ b/src/srun/msg.c
@@ -311,6 +311,8 @@ _confirm_launch_complete(job_t *job)
                 if (job->host_state[i] != SRUN_HOST_REPLIED) {
                         error ("Node %s not responding, terminating job step",
                                job->host[i]);
+                        info("sending Ctrl-C to remaining tasks");
+                        fwd_signal(job, SIGINT);
                         job->rc = 124;
                         update_job_state(job, SRUN_JOB_FAILED);
                         pthread_exit(0);
@@ -564,7 +566,7 @@ _accept_msg_connection(job_t *job, int fdnum)
         slurm_fd     fd = (slurm_fd) NULL;
         slurm_msg_t *msg = NULL;
         slurm_addr   cli_addr;
-        char         host[256];
+        unsigned char *uc;
         short        port;
         int          timeout = 0;      /* slurm default value */
 
@@ -578,8 +580,12 @@ _accept_msg_connection(job_t *job, int fdnum)
                 return;
         }
 
-        slurm_get_addr(&cli_addr, &port, host, sizeof(host));
-        debug2("got message connection from %s:%d", host, ntohs(port));
+        /* Do not call slurm_get_addr() here: the peer's IP address may
+         * not be in /etc/hosts. */
+        uc = (unsigned char *)&cli_addr.sin_addr.s_addr;
+        port = cli_addr.sin_port;
+        debug2("got message connection from %u.%u.%u.%u:%d",
+               uc[0], uc[1], uc[2], uc[3], ntohs(port));
 
         msg = xmalloc(sizeof(*msg));
 
@@ -592,7 +598,8 @@ _accept_msg_connection(job_t *job, int fdnum)
         if (slurm_receive_msg(fd, msg, timeout) < 0) {
                 if (errno == EINTR)
                         goto again;
-                error("slurm_receive_msg[%s]: %m", host);
+                error("slurm_receive_msg[%u.%u.%u.%u]: %m",
+                      uc[0], uc[1], uc[2], uc[3]);
                 xfree(msg);
         } else {
diff --git a/src/srun/reattach.c b/src/srun/reattach.c
index a0ae575cd2a20566c26e1192981000b0fa46cea3..49bd03c24b986a316616799aa439b379b366b16d 100644
--- a/src/srun/reattach.c
+++ b/src/srun/reattach.c
@@ -44,6 +44,7 @@
 #include "src/common/macros.h"
 #include "src/common/hostlist.h"
 #include "src/common/slurm_protocol_api.h"
+#include "src/common/read_config.h"
 
 #include "src/srun/job.h"
 #include "src/srun/launch.h"
@@ -193,10 +194,6 @@ _get_job_info(srun_step_t *s)
         job_info_msg_t *resp = NULL;
         job_info_t *job = NULL;
         hostlist_t hl;
-#ifdef HAVE_FRONT_END  /* Fake address for front-end node */
-        old_job_alloc_msg_t alloc_req;
-        resource_allocation_response_msg_t *alloc_resp = NULL;
-#endif
 
         s->nodes = NULL;
 
@@ -232,26 +229,11 @@ _get_job_info(srun_step_t *s)
                 error ("Unable to create hostlist from `%s'", job->nodes);
                 goto done;
         }
-
-        rc = 0;
         s->nodes = hostlist_shift(hl);
-        s->ntasks = 1;
-
         hostlist_destroy(hl);
-#ifdef HAVE_FRONT_END  /* Fake address for front-end node */
-        /* now get actual node name for systems using front-end node */
-        alloc_req.job_id = s->jobid;
-        alloc_req.uid    = getuid();
-        if (slurm_confirm_allocation(&alloc_req, &alloc_resp) == 0) {
-                uint16_t port;
-                free(s->nodes);
-                s->nodes = malloc(128);
-                slurm_get_addr(&alloc_resp->node_addr[0], &port,
-                               s->nodes, 128);
-                slurm_free_resource_allocation_response_msg(alloc_resp);
-        }
-#endif
+        s->ntasks = 1;
+        rc = 0;
 
    done:
         if (resp)
@@ -312,6 +294,7 @@ _attach_to_job(job_t *job)
         uint16_t port = slurm_get_slurmd_port();
         reattach_tasks_request_msg_t *req;
         slurm_msg_t                  *msg;
+        char *nd;
 
         req = xmalloc(job->nhosts * sizeof(*req));
         msg = xmalloc(job->nhosts * sizeof(*msg));
@@ -339,7 +322,7 @@ _attach_to_job(job_t *job)
                 m->data     = r;
                 m->msg_type = REQUEST_REATTACH_TASKS;
 
-                slurm_set_addr_char(&m->address, port, job->host[i]);
+                memcpy(&m->address, &job->slurmd_addr[i], sizeof(slurm_addr));
         }
 
         _p_reattach(msg, job);
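The srun.c changes that follow take effect only when the corresponding
options are enabled in slurm.conf (assuming the MpichGmDirectSupport
setting is what slurm_get_mpich_gm_dir() reports); a hypothetical
fragment, values illustrative only:

    # Example only: enable the direct MPICH-GM launch path and the
    # process-tree signalling it requires.
    MpichGmDirectSupport=1
    KillTree=1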
diff --git a/src/srun/srun.c b/src/srun/srun.c
index 6f09ed4fe400122b3e9224e9b8d70c9eebad3384..a721d181fc04e995b6957866c24adc9204c3943f 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -66,6 +66,7 @@
 #include "src/srun/env.h"
 #include "src/srun/io.h"
 #include "src/srun/job.h"
+#include "src/srun/gmpi.h"
 #include "src/srun/launch.h"
 #include "src/srun/msg.h"
 #include "src/srun/net.h"
@@ -211,9 +212,11 @@ int srun(int ac, char **av)
          */
         setenvf("SLURM_NODELIST=%s", job->nodelist);
         setenvf("SLURM_JOBID=%u", job->jobid);
-        setenvf("SLURM_NPROCS=%d", opt.nprocs);
+        if (opt.nprocs_set)
+                setenvf("SLURM_NPROCS=%d", opt.nprocs);
         setenvf("SLURM_NNODES=%d", job->nhosts);
         setenvf("SLURM_TASKS_PER_NODE=%s", task_cnt = _task_count_string (job));
+        xfree(task_cnt);
         setenvf("SLURM_DISTRIBUTION=%s",
                 format_distribution_t (opt.distribution));
@@ -223,7 +226,24 @@ int srun(int ac, char **av)
         if (bgl_part_id)
                 setenvf("BGL_PARTITION_ID=%s", bgl_part_id);
 
-        xfree(task_cnt);
+        if (slurm_get_mpich_gm_dir() && getenv("GMPI_PORT") == NULL) {
+                /*
+                 * The mpirun in the MPICH-GM distribution can be modified
+                 * to call srun, instead of rsh, for remote process
+                 * invocation.  In that case GMPI_PORT is already set, and
+                 * we must not override GMPI_* nor open another master port.
+                 */
+                char *port = NULL;
+                if (gmpi_thr_create(job, &port))
+                        job_fatal(job, "Unable to create GMPI thread");
+                setenvf("GMPI_PORT=%s", port);
+                xfree(port);
+                setenvf("GMPI_SHMEM=1");
+                setenvf("GMPI_MAGIC=%u", job->jobid);
+                setenvf("GMPI_NP=%d", opt.nprocs);
+                setenvf("GMPI_BOARD=-1");       /* FIXME for multi-board config. */
+                setenvf("SLURM_GMPI=1");        /* mark for slurmd */
+        }
 
         if (msg_thr_create(job) < 0)
                 job_fatal(job, "Unable to create msg thread");
@@ -575,7 +595,8 @@ _set_batch_script_env(job_t *job)
         char *p;
         struct utsname name;
 
-        if (opt.nprocs_set && setenvf("SLURM_NPROCS=%u", opt.nprocs)) {
+        if ( opt.nprocs_set
+        &&   setenvf("SLURM_NPROCS=%u", opt.nprocs)) {
                 error("Unable to set SLURM_NPROCS environment variable");
                 rc = SLURM_FAILURE;
         }
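Putting the pieces together, a direct MPICH-GM launch would look something
like the following (node and task counts and the binary name are
illustrative):

    srun -N2 -n4 ./mpi_hello

The bootstrap sequence is then: srun opens the master socket and exports
GMPI_PORT, GMPI_MAGIC (the job id), GMPI_NP, GMPI_BOARD, GMPI_SHMEM and
SLURM_GMPI; slurmd adds a per-task GMPI_ID; each task's MPICH-GM runtime
connects back to GMPI_PORT with its init message; the master composes the
global and local port maps and delivers them to each slave's receive port;
finally the master thread waits for a possible <<<ABORT_jobid_ABORT>>>
message, on which it forwards SIGKILL to the entire job step.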