From 0dd3f0cf93796c66e0f16031fb044cf3bd9dc26a Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 31 Dec 2003 21:48:21 +0000 Subject: [PATCH] Switch plugin module added. While many files were modified, these modifications were relatively minor - mostly changes in function names or arguments. --- NEWS | 5 + configure.ac | 63 +- doc/html/quickstart.html | 4 +- doc/man/man5/slurm.conf.5 | 10 + etc/slurm.conf.example | 13 + slurm/slurm.h.in | 20 +- src/api/config_info.c | 3 - src/common/Makefile.am | 17 +- src/common/elanhosts.c | 387 --------- src/common/elanhosts.h | 121 --- src/common/plugin.c | 12 +- src/common/qsw.c | 1058 ----------------------- src/common/qsw.h | 100 --- src/{slurmd => common}/setenvpf.c | 2 +- src/{slurmd => common}/setenvpf.h | 2 +- src/common/slurm_protocol_defs.c | 19 +- src/common/slurm_protocol_defs.h | 9 +- src/common/slurm_protocol_pack.c | 49 +- src/common/switch.c | 390 +++++++++ src/common/switch.h | 228 +++++ src/plugins/Makefile.am | 2 +- src/slurmctld/controller.c | 5 +- src/slurmctld/job_mgr.c | 21 +- src/slurmctld/job_scheduler.c | 4 +- src/slurmctld/node_mgr.c | 7 +- src/slurmctld/proc_req.c | 30 +- src/slurmctld/read_config.c | 105 +-- src/slurmctld/slurmctld.h | 9 +- src/slurmctld/step_mgr.c | 60 +- src/slurmd/Makefile.am | 18 +- src/slurmd/elan_interconnect.c | 322 ------- src/slurmd/interconnect.h | 116 --- src/slurmd/job.c | 4 +- src/slurmd/job.h | 6 +- src/slurmd/mgr.c | 11 +- src/slurmd/no_interconnect.c | 80 -- src/slurmd/slurmd.c | 1 - src/slurmd/smgr.c | 14 +- src/slurmd/ulimits.c | 2 +- src/srun/allocate.c | 4 +- src/srun/job.h | 4 +- src/srun/launch.c | 5 +- src/srun/srun.c | 36 +- testsuite/slurm_unit/common/Makefile.am | 2 +- 44 files changed, 796 insertions(+), 2584 deletions(-) delete mode 100644 src/common/elanhosts.c delete mode 100644 src/common/elanhosts.h delete mode 100644 src/common/qsw.c delete mode 100644 src/common/qsw.h rename src/{slurmd => common}/setenvpf.c (98%) rename src/{slurmd 
=> common}/setenvpf.h (96%) create mode 100644 src/common/switch.c create mode 100644 src/common/switch.h delete mode 100644 src/slurmd/elan_interconnect.c delete mode 100644 src/slurmd/interconnect.h delete mode 100644 src/slurmd/no_interconnect.c diff --git a/NEWS b/NEWS index 48c3ef1aff9..5fe2ff12293 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,11 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 0.3.0.0-pre6 (NOT TAGGED YET) +=============================== + -- Switch plugin added. Add "SwitchType=switch/elan" to slurm.conf for + systems with Quadrics Elan3 or Elan4 switches. + * Changes in SLURM 0.3.0.0-pre5 =============================== -- Fixes for reported problems: diff --git a/configure.ac b/configure.ac index f50f073bc04..7ad0de3a820 100644 --- a/configure.ac +++ b/configure.ac @@ -160,54 +160,32 @@ AC_SUBST(SLURMD_PORT) dnl check for whether to include Elan support dnl AC_MSG_CHECKING(whether to include Elan support) -AC_ARG_WITH(elan, - AC_HELP_STRING([--with-elan],[compile with Elan support]), - [ case "${withval}" in - yes) elan=yes ;; - no) elan=no ;; - *) AC_MSG_ERROR([bad value ${enableval} for --with-elan]) ;; - esac - ] +savedLIBS="$LIBS" +AC_CHECK_LIB([elanctrl], [elanctrl_open], + [ have_elanctrl=yes + have_elan=yes + ELAN_LIBS="-lelanctrl" + AC_DEFINE(HAVE_LIBELANCTRL, 1, + [define if you have libelanctrl.]) ], + [ have_elanctrl=no ] ) -AC_MSG_RESULT(${elan=no}) -AM_CONDITIONAL(WITH_ELAN, test "x$with_elan" = "xyes") -HAVE_ELAN=0 -if test "$with_elan" = "yes"; then - savedLIBS="$LIBS" - AC_CHECK_LIB([elanctrl], [elanctrl_open], - [ have_elanctrl=yes - ELAN_LIBS="-lelanctrl" - AC_DEFINE(HAVE_LIBELANCTRL, 1, - [define if you have libelanctrl.]) - ], - [ have_elanctrl=no - ] - ) - - AC_CHECK_LIB([elan3], [elan3_create], - [ have_elan3=yes - ELAN_LIBS="-lelan3" - AC_DEFINE(HAVE_LIBELAN3, 1, - [define if you have libelan3.]) - ], - [ 
have_elan3=no - ] +AC_CHECK_LIB([elan3], [elan3_create], + [ have_elan3=yes + have_elan=yes + ELAN_LIBS="-lelan3" + AC_DEFINE(HAVE_LIBELAN3, 1, + [define if you have libelan3.]) ], + [ have_elan3=no ] ) - - if test "$have_elanctrl" = "no" -a "$have_elan3" = "no"; then - AC_MSG_ERROR([Unable to find libelan3 or libelanctrl for Elan support!]) - fi - +AM_CONDITIONAL(HAVE_ELAN, test "x$have_elan" = "xyes") +if test "x$have_elan" = "xyes"; then + AC_DEFINE(HAVE_ELAN, 1, [Define to enable Elan support.]) AC_CHECK_LIB(rmscall, rms_prgcreate, - [], + [ELAN_LIBS="$ELAN_LIBS -lrmscall"], [AC_MSG_ERROR([unable to find the RMS library needed for Elan support])], ) - HAVE_ELAN=1 - AC_DEFINE(HAVE_ELAN, 1, [Define to enable Elan support.]) - ELAN_LIBS="$ELAN_LIBS -lrmscall" - LIBS="$savedLIBS" fi - +LIBS="$savedLIBS" AC_SUBST(HAVE_ELAN) AC_SUBST(ELAN_LIBS) @@ -316,6 +294,7 @@ AC_CONFIG_FILES([Makefile src/plugins/sched/backfill/Makefile src/plugins/sched/builtin/Makefile src/plugins/sched/wiki/Makefile + src/plugins/switch/Makefile doc/Makefile doc/man/Makefile testsuite/Makefile diff --git a/doc/html/quickstart.html b/doc/html/quickstart.html index 962835d7bb6..da6bc12a06d 100644 --- a/doc/html/quickstart.html +++ b/doc/html/quickstart.html @@ -248,9 +248,6 @@ The most commonly used arguments to the <i>configure</i> command include: value is <i>/usr/local</i> <dt>--sysconfdir=<i>DIR</i> <dd>Specify location of SLURM configuration file -<dt>--with-elan -<dd>Ccompile with support for the Quadrics Elan switch (see -<a href="http://www.quadrics.com">http://www.quadrics.com</a>) <dt>--with-totalview <dd>compile with support for the TotalView debugger (see <a href="http://www.etnus.com/">http://www.etnus.com</a>) @@ -304,6 +301,7 @@ SlurmdPort=7003 SlurmdSpoolDir=/var/tmp/slurmd.spool SlurmdTimeout=300 StateSaveLocation=/tmp/slurm.state +SwitchType=switch/elan # # Node Configurations # diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 7654a6ce7d9..2c292d826d6 
100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -267,6 +267,14 @@ The default value is "/tmp". If any slurm daemons terminate abnormally, their core files will also be written into this directory. .TP +\fBSwitchType\fR +Identifies the type of switch or interconnect used for application communications. +Acceptable values include +"switch/none" for switches not requiring special processing for job launch +or termination (Myrinet, Ethernet, and InfiniBand), +"switch/elan" for Quadrics Elan 3 or Elan 4 interconnect. +The default value is "switch/none". +.TP \fBTmpFS\fR Fully qualified pathname of the file system available to user jobs for temporary storage. This parameter is used in establishing a node's \fBTmpDisk\fR space. @@ -547,6 +555,8 @@ SlurmdSpoolDir=/usr/local/slurm/slurmd.spool .br StateSaveLocation=/usr/local/slurm/slurm.state .br +SwitchType=switch/elan +.br TmpFS=/tmp .br WaitTime=30 diff --git a/etc/slurm.conf.example b/etc/slurm.conf.example index 0ca9ffed9a6..6e2f39cf907 100644 --- a/etc/slurm.conf.example +++ b/etc/slurm.conf.example @@ -157,6 +157,7 @@ # SchedulerAuth=42 # SchedulerPort=7321 + # # o Define the job completion logging mechanism to be used # @@ -166,6 +167,18 @@ # JobCompType=jobcomp/filetxt +# +# o Define the switch or interconnect in use. +# +# "SwitchType" : the type of switch or interconnect. +# "switch/none" : the default, supports all switches not requiring +# special set-up for job launch including Myrinet, +# Ethernet, and InfiniBand. +# "switch/elan" : Quadrics Elan 3 or Elan 4 interconnect. 
+# +# SwitchType=switch/none + + # # o Define location where job completion logs are to be written # Interpretation of the parameter is dependent upon the logging diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index adfa83ba61e..d0ca94f116e 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -81,12 +81,10 @@ BEGIN_C_DECLS typedef struct slurm_job_credential * slurm_cred_t; #endif -/* Define qsw_jobinfo_t below to avoid including extraneous slurm headers */ -#ifdef HAVE_ELAN -# ifndef __qsw_jobinfo_t_defined -# define __qsw_jobinfo_t_defined - typedef struct qsw_jobinfo *qsw_jobinfo_t; /* opaque data type */ -# endif +/* Define switch_jobinfo_t below to avoid including extraneous slurm headers */ +#ifndef __switch_jobinfo_t_defined +# define __switch_jobinfo_t_defined + typedef struct switch_jobinfo *switch_jobinfo_t; /* opaque data type */ #endif /*****************************************************************************\ @@ -265,9 +263,7 @@ typedef struct job_step_create_response_msg { uint32_t job_step_id; /* assigned job step id */ char *node_list; /* list of allocated nodes */ slurm_cred_t cred; /* slurm job credential */ -#ifdef HAVE_ELAN - qsw_jobinfo_t qsw_job; /* Elan3 switch context, opaque data structure */ -#endif + switch_jobinfo_t switch_job; /* switch context, opaque data structure */ } job_step_create_response_msg_t; typedef struct { @@ -356,9 +352,7 @@ typedef struct resource_allocation_and_run_response_msg { uint32_t job_step_id; /* assigned step id */ slurm_cred_t cred; /* slurm job credential */ -#ifdef HAVE_ELAN - qsw_jobinfo_t qsw_job; /* Elan3 switch context, opaque data type */ -#endif + switch_jobinfo_t switch_job; /* switch context, opaque data type */ } resource_allocation_and_run_response_msg_t; typedef struct partition_info_msg { @@ -409,12 +403,12 @@ typedef struct slurm_ctl_conf { uint32_t slurmd_port; /* default communications port to slurmd */ char *slurmd_spooldir; /* where slurmd put temporary state info */ char 
*slurmd_pidfile; /* where to put slurmd pidfile */ - char *switch_type; /* switch or interconnect type */ uint16_t slurmd_timeout;/* how long slurmctld waits for slurmd before * considering node DOWN */ char *slurm_conf; /* pathname of slurm config file */ char *state_save_location;/* pathname of slurmctld state save * directory */ + char *switch_type; /* switch or interconnect type */ char *tmp_fs; /* pathname of temporary file system */ uint16_t wait_time; /* default job --wait time */ char *job_credential_private_key; /* path to private key */ diff --git a/src/api/config_info.c b/src/api/config_info.c index 44841599b57..b41504cac26 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -129,11 +129,8 @@ void slurm_print_ctl_conf ( FILE* out, slurm_ctl_conf_ptr->slurm_conf); fprintf(out, "StateSaveLocation = %s\n", slurm_ctl_conf_ptr->state_save_location); -#if 0 -Not quite ready to check in fprintf(out, "SwitchType = %s\n", slurm_ctl_conf_ptr->switch_type); -#endif fprintf(out, "TmpFS = %s\n", slurm_ctl_conf_ptr->tmp_fs); fprintf(out, "WaitTime = %u\n", diff --git a/src/common/Makefile.am b/src/common/Makefile.am index fa75335cd20..6bc4a9d11e3 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -5,12 +5,6 @@ AUTOMAKE_OPTIONS = foreign INCLUDES = -I$(top_srcdir) $(SSL_CPPFLAGS) -if WITH_ELAN -elan_sources = qsw.c qsw.h elanhosts.c elanhosts.h -else -elan_sources = -endif - noinst_LTLIBRARIES = \ libcommon.la \ libdaemonize.la \ @@ -34,6 +28,7 @@ libcommon_la_SOURCES = \ plugin.c plugin.h \ plugrack.c plugrack.h \ read_config.c read_config.h \ + setenvpf.c setenvpf.h \ slurm_cred.h \ slurm_cred.c \ slurm_errno.c \ @@ -53,10 +48,10 @@ libcommon_la_SOURCES = \ util-net.c util-net.h \ slurm_auth.c slurm_auth.h \ slurm_jobcomp.c slurm_jobcomp.h \ + switch.c switch.h \ arg_desc.c arg_desc.h \ macros.h \ - hostlist.c hostlist.h \ - $(elan_sources) + hostlist.c hostlist.h libdaemonize_la_SOURCES = \ daemonize.c \ @@ -67,9 +62,5 @@ 
libeio_la_SOURCES = \ eio.c eio.h \ io_hdr.c io_hdr.h -EXTRA_libcommon_la_SOURCES = \ - qsw.c qsw.h \ - elanhosts.c elanhosts.h - -libcommon_la_LIBADD = $(SSL_LIBS) $(ELAN_LIBS) -ldl +libcommon_la_LIBADD = $(SSL_LIBS) -ldl libcommon_la_LDFLAGS = $(SSL_LDFLAGS) diff --git a/src/common/elanhosts.c b/src/common/elanhosts.c deleted file mode 100644 index 47fcda1c210..00000000000 --- a/src/common/elanhosts.c +++ /dev/null @@ -1,387 +0,0 @@ -/*****************************************************************************\ - * $Id$ - ***************************************************************************** - * Copyright (C) 2001-2002 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Mark Grondona <mgrondona@llnl.gov>. - * UCRL-CODE-2003-005. - * - * This file is part of Pdsh, a parallel remote shell program. - * For details, see <http://www.llnl.gov/linux/pdsh/>. - * - * Pdsh is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with Pdsh; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
-\*****************************************************************************/ - -#if HAVE_CONFIG_H -#include "config.h" -#endif - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <assert.h> -#include <stdarg.h> -#include <stdlib.h> - -#include "src/common/list.h" -#include "src/common/hostlist.h" -#include "elanhosts.h" - -/* Default ElanId config file */ -#define ELANID_CONFIG_FILE "/etc/elanhosts" - -/* - * Error strings for error codes returned by parse_elanid_config() - */ -static char *errstr[] = -{ "No error", - "Out of memory!", - "Parse error", - "Number of ElanIds specified != number of hosts", - "Type must be \"eip\" \"eth\" or \"other\"", - NULL -}; - -/* - * Container for converting hostnames to ElanIDs - */ -struct elan_info { - elanhost_type_t type; /* type of entry */ - int elanid; /* ElanID corresponding to this hostname */ - char *hostname; /* Resolveable hostname */ -}; - -struct elanhost_config { -#ifndef NDEBUG - int magic; -# define ELANHOST_CONFIG_MAGIC 0xe100e100 -#endif - int maxid; /* Storage for max ElanID in config */ - List elanid_list; /* List of elan_info objects describing configuration */ - char errstr[1024]; /* String describing last error from this object */ -}; - - -/* - * Static Prototypes: - */ -static elanhost_config_t _elanhost_config_alloc(void); -static void _elanhost_err(elanhost_config_t ec, const char *fmt, ...); -static int _find_host(struct elan_info *ei, char *key); -static int _parse_elanid_config(elanhost_config_t ec, const char *path); -static int _parse_elanid_line(elanhost_config_t ec, char *buf); -static struct elan_info * _elan_info_create(elanhost_type_t type, - int elanid, char *hostname); -static void _elan_info_destroy(struct elan_info *ei); - - -elanhost_config_t elanhost_config_create() -{ - return _elanhost_config_alloc(); -} - - -int elanhost_config_read(elanhost_config_t ec, const char *filename) -{ - assert(ec != NULL); - assert(ec->magic == ELANHOST_CONFIG_MAGIC); - 
assert(ec->elanid_list != NULL); - - if (filename == NULL) - filename = ELANID_CONFIG_FILE; - - if (_parse_elanid_config(ec, filename) < 0) - return(-1); - - return(0); -} - -void elanhost_config_destroy(elanhost_config_t ec) -{ - assert(ec != NULL); - assert(ec->magic == ELANHOST_CONFIG_MAGIC); - list_destroy(ec->elanid_list); - assert(ec->magic = ~ELANHOST_CONFIG_MAGIC); - free(ec); -} - -int elanhost_config_maxid(elanhost_config_t ec) -{ - assert(ec != NULL); - assert(ec->magic == ELANHOST_CONFIG_MAGIC); - - return ec->maxid; -} - -int elanhost_host2elanid(elanhost_config_t ec, char *host) -{ - struct elan_info *ei; - - assert(ec != NULL); - assert(host != NULL); - assert(ec->magic == ELANHOST_CONFIG_MAGIC); - - ei = list_find_first(ec->elanid_list, (ListFindF) _find_host, host); - - if (!ei) { - _elanhost_err(ec, "Unable to find host \"%s\" in configuration", host); - return -1; - } - - return ei->elanid; -} - -const char *elanhost_config_err(elanhost_config_t ec) -{ - return ec->errstr; -} - - -struct elanid_find_arg { - elanhost_type_t type; - int elanid; -}; - -static int _find_elanid(struct elan_info *ei, struct elanid_find_arg *arg) -{ - if (ei->type != arg->type) - return 0; - - if (ei->elanid != arg->elanid) - return 0; - - return 1; -} - -char *elanhost_elanid2host(elanhost_config_t ec, elanhost_type_t type, int eid) -{ - struct elan_info *ei; - struct elanid_find_arg arg; - - assert(ec != NULL); - assert(eid >= 0); - assert(ec->magic == ELANHOST_CONFIG_MAGIC); - - arg.type = type; - arg.elanid = eid; - - ei = list_find_first(ec->elanid_list, (ListFindF) _find_elanid, &arg); - - if (!ei) { - _elanhost_err(ec, "Unable to find host with type=%d elanid=%d", - type, eid); - return(NULL); - } - - return ei->hostname; -} - -static elanhost_config_t _elanhost_config_alloc(void) -{ - elanhost_config_t new = malloc(sizeof(*new)); - - new->maxid = -1; - new->elanid_list = list_create((ListDelF) _elan_info_destroy); - - assert(new->magic = ELANHOST_CONFIG_MAGIC); 
- - return new; -} - -static void _elanhost_err(elanhost_config_t ec, const char *fmt, ...) -{ - va_list ap; - - assert(ec != NULL); - assert(fmt != NULL); - - va_start(ap, fmt); - vsnprintf(ec->errstr, 1024, fmt, ap); - va_end(ap); - - return; -} - -/* - * Parse the "elanhosts" config file which has the form - * - * ElanIds Hostnames - * [n-m] host_n,...,host_m - * [n-m] host[n-m] - * etc. - * - * and which maps ElanIds to hostnames on the cluster. - * The results are stored in the config object's elanid_list member. - * - * Returns 0 on Success, and an error code < 0 on failure. - */ -static int _parse_elanid_config(elanhost_config_t ec, const char *path) -{ - char buf[4096]; - int line; - FILE *fp; - - if (!(fp = fopen(path, "r"))) { - _elanhost_err(ec, "failed to open %s\n", path); - return -1; - } - - line = 1; - while (fgets(buf, 4096, fp)) { - int rc; - if ((rc = _parse_elanid_line(ec, buf)) < 0) { - _elanhost_err(ec, "%s: line %d: %s", path, line, errstr[-rc]); - return -1; - } - line++; - } - - if (fclose(fp) < 0) - _elanhost_err(ec, "close(%s): %m", path); - - return 0; -} - - -/* - * Translate type strings "eip," "eth," or "other" into their - * corresponding elanhost_type_t number - */ -static elanhost_type_t _get_type_num(char *type) -{ - if (strcasecmp(type, "eip") == 0) - return ELANHOST_EIP; - else if (strcasecmp(type, "eth") == 0) - return ELANHOST_ETH; - else if (strcasecmp(type, "other") == 0) - return ELANHOST_OTHER; - else - return -1; -} - -/* - * Parse one line of elanId list appending results to list "eil" - * - * Returns -1 for parse error, -2 if the number of elanids specified - * doesn't equal the number of hosts. 
- * - * Returns 0 on success - */ -static int -_parse_elanid_line(elanhost_config_t ec, char *buf) -{ - hostlist_t el, hl; - const char *separators = " \t\n"; - char *type; - char *elanids; - char *hosts; - char *sp, *s; - int rc = 0; - int typenum; - - /* - * Nullify any comments - */ - if ((s = strchr(buf, '#'))) - *s = '\0'; - - if (!(type = strtok_r(buf, separators, &sp))) - return 0; - - if (!(elanids = strtok_r(NULL, separators, &sp))) - return -1; - - if (!(hosts = strtok_r(NULL, separators, &sp))) - return -2; - - el = hostlist_create(NULL); - hl = hostlist_create(NULL); - - if (!el || !hl) { - rc = -1; - goto done; - } - - if (hostlist_push(el, elanids) != hostlist_push(hl, hosts)) { - rc = -3; - goto done; - } - - if ((typenum = _get_type_num(type)) < 0) - return -4; - - while ((s = hostlist_shift(el))) { - char *eptr; - int elanid = (int) strtoul(s, &eptr, 10); - - if (*eptr != '\0') { - rc = -2; - goto done; - } - - free(s); - if (!(s = hostlist_shift(hl))) { - rc = -1; - goto done; - } - - if (elanid > ec->maxid) - ec->maxid = elanid; - - list_append(ec->elanid_list, _elan_info_create(typenum, elanid, s)); - } - - done: - hostlist_destroy(el); - hostlist_destroy(hl); - - return rc; -} - -static struct elan_info * -_elan_info_create(elanhost_type_t type, int elanid, char *hostname) -{ - struct elan_info *ei = (struct elan_info *) malloc(sizeof(*ei)); - ei->type = type; - ei->elanid = elanid; - ei->hostname = hostname; - return ei; -} - -static void -_elan_info_destroy(struct elan_info *ei) -{ - if (ei->hostname) - free(ei->hostname); - free(ei); -} - - -/* - * List Find function for mapping hostname to an ElanId - */ -static int _find_host(struct elan_info *ei, char *key) -{ - if (strcmp(ei->hostname, key) != 0) - return 0; - else - return 1; -} - - -/* - * vi:tabstop=4 shiftwidth=4 expandtab - */ - diff --git a/src/common/elanhosts.h b/src/common/elanhosts.h deleted file mode 100644 index d5cb0bb6526..00000000000 --- a/src/common/elanhosts.h +++ 
/dev/null @@ -1,121 +0,0 @@ -/*****************************************************************************\ - * $Id$ - ***************************************************************************** - * Copyright (C) 2001-2002 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Mark Grondona <mgrondona@llnl.gov>. - * UCRL-CODE-2003-005. - * - * This file is part of Pdsh, a parallel remote shell program. - * For details, see <http://www.llnl.gov/linux/pdsh/>. - * - * Pdsh is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * Pdsh is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with Pdsh; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
-\*****************************************************************************/ - -#ifndef _ELANHOSTS_H -#define _ELANHOSTS_H - -/* - * Type of Elan "hostname" - * Hostname corresponds to the eip adapter, an ethernet adapter, or "other" - */ -typedef enum { - ELANHOST_EIP, - ELANHOST_ETH, - ELANHOST_OTHER -} elanhost_type_t; - -/* Opaque type which holds the elanhost configuration - */ -typedef struct elanhost_config * elanhost_config_t; - - -/* - * Functions - */ - -/* - * Create an empty Elanhost config object - */ -elanhost_config_t elanhost_config_create(void); - -/* - * Read elanhosts configuration from `file' - * (Default /etc/elanhosts) - * - * Config file format is as follows: - * - * Type ElanIDs Hostnames - * - * The "type" field may be "eip" for eip interface, "eth" for an - * ethernet interface, or "other" for anything else. ("eth" and - * "other" are equivalent at this time) - * - * The "ElanIDs" field consists of a list of one or more ElanIDs in - * the form "[i-j,n-m,..]" or just "N" for a single ElanID. - * - * The "Hostname" field consists of the hostnames which correspond - * to the ElanIDs. If the hostnames have a numeric suffix a bracketed - * hostlist is allowed (see hostlist.[ch]) - * - * For Example: - * - * Type ElanIDs Hostnames - * eip [0-10] host[0-10] - * eth [0-10] ehost[0-10] - * eth [0,1] host0-eth1,host1-eth1 - * - * Returns 0 on succes, -1 for failure. - * - */ -int elanhost_config_read(elanhost_config_t ec, const char *filename); - - -/* - * Destroy an elanhost configuration object. - */ -void elanhost_config_destroy(elanhost_config_t conf); - - -/* - * Given a hostname, return the corresponding ElanID - * - * Returns the ElanId on success, -1 if no host matching "hostname" - * was found in the configuration. - * - */ -int elanhost_host2elanid(elanhost_config_t ec, char *host); - - -/* - * Given an ElanId and adapter type, return the first matching hostname - * from the configuration. 
- */ -char *elanhost_elanid2host(elanhost_config_t ec, - elanhost_type_t type, int elanid); - - -/* - * Returns the max ElanID from the configuration - */ -int elanhost_config_maxid(elanhost_config_t ec); - - -/* - * Returns the last error string generated for the elan config obj `ec' - */ -const char *elanhost_config_err(elanhost_config_t ec); - -#endif diff --git a/src/common/plugin.c b/src/common/plugin.c index 56fa3d53ac4..a48b4773f26 100644 --- a/src/common/plugin.c +++ b/src/common/plugin.c @@ -60,7 +60,8 @@ plugin_peek( const char *fq_path, } } else { dlclose( plug ); - error( "%s: not a SLURM plugin", fq_path ); + /* could be vestigial library, don't treat as an error */ + verbose( "%s: not a SLURM plugin", fq_path ); return SLURM_ERROR; } if ( ( version = (uint32_t *) dlsym( plug, PLUGIN_VERSION ) ) != NULL ) { @@ -69,7 +70,8 @@ plugin_peek( const char *fq_path, } } else { dlclose( plug ); - error( "%s: not a SLURM plugin", fq_path ); + /* could be vestigial library, don't treat as an error */ + verbose( "%s: not a SLURM plugin", fq_path ); return SLURM_ERROR; } @@ -93,7 +95,7 @@ plugin_load_from_file( const char *fq_path ) */ plug = dlopen( fq_path, RTLD_NOW ); if ( plug == NULL ) { - debug2( "plugin_load_from_file: dlopen(%s): %s", + debug( "plugin_load_from_file: dlopen(%s): %s", fq_path, dlerror() ); return PLUGIN_INVALID_HANDLE; @@ -103,6 +105,7 @@ plugin_load_from_file( const char *fq_path ) if ( ( dlsym( plug, PLUGIN_NAME ) == NULL ) || ( dlsym( plug, PLUGIN_TYPE ) == NULL ) || ( dlsym( plug, PLUGIN_VERSION ) == NULL ) ) { + debug( "plugin_load_from_file: invalid symbol"); /* slurm_seterrno( SLURM_PLUGIN_SYMBOLS ); */ return PLUGIN_INVALID_HANDLE; } @@ -113,7 +116,8 @@ plugin_load_from_file( const char *fq_path ) */ if ( ( init = dlsym( plug, "init" ) ) != NULL ) { if ( (*init)() != 0 ) { - debug( "plugin_load_from_file(%s): init() returned SLURM_ERROR", fq_path ); + debug( "plugin_load_from_file(%s): init() returned SLURM_ERROR", + fq_path ); 
(void) dlclose( plug ); return PLUGIN_INVALID_HANDLE; } diff --git a/src/common/qsw.c b/src/common/qsw.c deleted file mode 100644 index e942834f531..00000000000 --- a/src/common/qsw.c +++ /dev/null @@ -1,1058 +0,0 @@ -/*****************************************************************************\ - * qsw.c - Library routines for initiating jobs on QsNet. - ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Jim Garlick <garlick@llnl.gov> - * UCRL-CODE-2002-040. - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
-\*****************************************************************************/ - -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#ifdef WITH_PTHREADS -# include <pthread.h> -#endif /* WITH_PTHREADS */ - -#include <sys/param.h> -#include <sys/types.h> -#include <sys/wait.h> -#include <syslog.h> -#include <errno.h> -#include <string.h> -#include <paths.h> -#include <stdarg.h> -#include <ctype.h> -#include <assert.h> -#include <stdlib.h> -#include <unistd.h> -#include <limits.h> /* INT_MAX */ -#include <stdio.h> - - -#if HAVE_LIBELANCTRL -# include <elan/elanctrl.h> -# include <elan/capability.h> - -/* These are taken from elan3/elanvp.h, which we don't - * want to include here since we are using the new - * version-nonspecific libelanctrl. - * (XXX: What is the equivalent in libelanctrl?) - */ -# define ELAN_USER_BASE_CONTEXT_NUM 0x020 -# define ELAN_USER_TOP_CONTEXT_NUM 0x7ff - -# define Version cap_version -# define HighNode cap_highnode -# define LowNode cap_lownode -# define HighContext cap_highcontext -# define LowContext cap_lowcontext -# define MyContext cap_mycontext -# define Bitmap cap_bitmap -# define Type cap_type -# define UserKey cap_userkey -# define RailMask cap_railmask -# define Values key_values -#elif HAVE_LIBELAN3 -# include <elan3/elan3.h> -# include <elan3/elanvp.h> -#else -# error "Must have either libelan3 or libelanctrl to compile this module!" -#endif /* HAVE_LIBELANCTRL */ - -#include <rms/rmscall.h> - -#include <slurm/slurm_errno.h> - -#include "src/common/elanhosts.h" -#include "src/common/xassert.h" -#include "src/common/strlcpy.h" -#include "src/common/bitstring.h" -#include "src/common/log.h" -#include "src/common/pack.h" -#include "src/common/qsw.h" - -/* - * Definitions local to this module. 
- */ -#define QSW_JOBINFO_MAGIC 0xf00ff00e -#define QSW_LIBSTATE_MAGIC 0xf00ff00f - -/* we will allocate program descriptions in this range */ -/* XXX note: do not start at zero as libelan shifts to get unique shm id */ -#define QSW_PRG_START 1 -#define QSW_PRG_END INT_MAX -#define QSW_PRG_INVAL (-1) - -/* we allocate elan hardware context numbers in this range */ -#define QSW_CTX_START ELAN_USER_BASE_CONTEXT_NUM - -/* XXX: Temporary workaround for slurm/222 (qws sw-kernel/5478) - * (sys_validate_cap does not allow ELAN_USER_TOP_CONTEXT_NUM) - */ -#define QSW_CTX_END ELAN_USER_TOP_CONTEXT_NUM - 1 -#define QSW_CTX_INVAL (-1) - -/* - * We are going to some trouble to keep these defs private so slurm - * hackers not interested in the interconnect details can just pass around - * the opaque types. All use of the data structure internals is local to this - * module. - */ -struct qsw_libstate { - int ls_magic; - int ls_prognum; - int ls_hwcontext; -}; - -struct qsw_jobinfo { - int j_magic; - int j_prognum; - ELAN_CAPABILITY j_cap; -}; - -/* Copy library state */ -#define _copy_libstate(dest, src) do { \ - assert((src)->ls_magic == QSW_LIBSTATE_MAGIC); \ - assert((dest)->ls_magic == QSW_LIBSTATE_MAGIC); \ - memcpy(dest, src, sizeof(struct qsw_libstate)); \ -} while (0) - -/* Lock on library state */ -#define _lock_qsw() do { \ - int err; \ - err = pthread_mutex_lock(&qsw_lock); \ - assert(err == 0); \ -} while (0) -#define _unlock_qsw() do { \ - int err; \ - err = pthread_mutex_unlock(&qsw_lock); \ - assert(err == 0); \ -} while (0) - -/* - * Globals - */ -static qsw_libstate_t qsw_internal_state = NULL; -static pthread_mutex_t qsw_lock = PTHREAD_MUTEX_INITIALIZER; -static elanhost_config_t elanconf = NULL; - - -/* - * Allocate a qsw_libstate_t. 
- * lsp (IN) store pointer to new instantiation here - * RETURN 0 on success, -1 on failure (sets errno) - */ -int -qsw_alloc_libstate(qsw_libstate_t *lsp) -{ - qsw_libstate_t new; - - assert(lsp != NULL); - new = (qsw_libstate_t)malloc(sizeof(struct qsw_libstate)); - if (!new) - slurm_seterrno_ret(ENOMEM); - new->ls_magic = QSW_LIBSTATE_MAGIC; - *lsp = new; - return 0; -} - -/* - * Free a qsw_libstate_t. - * ls (IN) qsw_libstate_t to free - */ -void -qsw_free_libstate(qsw_libstate_t ls) -{ - assert(ls->ls_magic == QSW_LIBSTATE_MAGIC); - ls->ls_magic = 0; - free(ls); -} - -/* - * Pack libstate structure in a format that can be shipped over the - * network and unpacked on a different architecture. - * ls (IN) libstate structure to be packed - * buffer (IN/OUT) where to store packed data - * RETURN #bytes unused in 'data' - */ -int -qsw_pack_libstate(qsw_libstate_t ls, Buf buffer) -{ - int offset; - - assert(ls->ls_magic == QSW_LIBSTATE_MAGIC); - offset = get_buf_offset(buffer); - - pack32(ls->ls_magic, buffer); - pack32(ls->ls_prognum, buffer); - pack32(ls->ls_hwcontext, buffer); - - return (get_buf_offset(buffer) - offset); -} - -/* - * Unpack libstate packed by qsw_pack_libstate. - * ls (IN/OUT) where to put libstate structure - * buffer (IN/OUT) where to get packed data - * RETURN #bytes unused or -1 on error (sets errno) - */ -int -qsw_unpack_libstate(qsw_libstate_t ls, Buf buffer) -{ - int offset; - - assert(ls->ls_magic == QSW_LIBSTATE_MAGIC); - offset = get_buf_offset(buffer); - - safe_unpack32(&ls->ls_magic, buffer); - safe_unpack32(&ls->ls_prognum, buffer); - safe_unpack32(&ls->ls_hwcontext, buffer); - - if (ls->ls_magic != QSW_LIBSTATE_MAGIC) - goto unpack_error; - - return SLURM_SUCCESS; - - unpack_error: - slurm_seterrno_ret(EBADMAGIC_QSWLIBSTATE); /* corrupted libstate */ - return SLURM_ERROR; -} - -/* - * Seed the random number generator. This can be called multiple times, - * but srand48 will only be called once per program invocation. 
- */ -static void -_srand_if_needed(void) -{ - static int done = 0; - - if (!done) { - srand48(getpid()); - done = 1; - } -} - -/* - * Initialize this library, optionally restoring a previously saved state. - * oldstate (IN) old state retrieved from qsw_fini() or NULL - * RETURN 0 on success, -1 on failure (sets errno) - */ -int -qsw_init(qsw_libstate_t oldstate) -{ - qsw_libstate_t new; - - assert(qsw_internal_state == NULL); - _srand_if_needed(); - if (qsw_alloc_libstate(&new) < 0) - return -1; /* errno set by qsw_alloc_libstate */ - if (oldstate) - _copy_libstate(new, oldstate); - else { - new->ls_prognum = QSW_PRG_START; - new->ls_hwcontext = QSW_CTX_START; - } - qsw_internal_state = new; - return 0; -} - -/* - * Finalize use of this library. If 'savestate' is non-NULL, final - * state is copied there before it is destroyed. - * savestate (OUT) place to put state - */ -void -qsw_fini(qsw_libstate_t savestate) -{ - assert(qsw_internal_state != NULL); - _lock_qsw(); - if (savestate) - _copy_libstate(savestate, qsw_internal_state); - qsw_free_libstate(qsw_internal_state); - qsw_internal_state = NULL; - _unlock_qsw(); -} - -/* - * Allocate a qsw_jobinfo_t. - * jp (IN) store pointer to new instantiation here - * RETURN 0 on success, -1 on failure (sets errno) - */ -int -qsw_alloc_jobinfo(qsw_jobinfo_t *jp) -{ - qsw_jobinfo_t new; - - assert(jp != NULL); - new = (qsw_jobinfo_t)malloc(sizeof(struct qsw_jobinfo)); - if (!new) - slurm_seterrno_ret(ENOMEM); - new->j_magic = QSW_JOBINFO_MAGIC; - - *jp = new; - return 0; -} - -/* - * Make a copy of a qsw_jobinfo_t. - * j (IN) qsw_jobinfo_t to be copied - * RETURN qsw_jobinfo_t on success, NULL on failure - */ -qsw_jobinfo_t -qsw_copy_jobinfo(qsw_jobinfo_t j) -{ - qsw_jobinfo_t new; - if (qsw_alloc_jobinfo(&new)) - return NULL; - memcpy(new, j, sizeof(struct qsw_jobinfo)); - - return new; -} - -/* - * Free a qsw_jobinfo_t. 
- * ls (IN) qsw_jobinfo_t to free - */ -void -qsw_free_jobinfo(qsw_jobinfo_t j) -{ - if (j == NULL) - return; - assert(j->j_magic == QSW_JOBINFO_MAGIC); - j->j_magic = 0; - free(j); -} - -/* - * Pack jobinfo structure in a format that can be shipped over the - * network and unpacked on a different architecture. - * j (IN) jobinfo structure to be packed - * buffer (OUT) where to store packed data - * RETURN #bytes unused in 'data' or -1 on error (sets errno) - * NOTE: Keep in sync with QSW_PACK_SIZE above - */ -int -qsw_pack_jobinfo(qsw_jobinfo_t j, Buf buffer) -{ - int i, offset; - - assert(j->j_magic == QSW_JOBINFO_MAGIC); - offset = get_buf_offset(buffer); - - pack32(j->j_magic, buffer); - pack32(j->j_prognum, buffer); - for (i = 0; i < 4; i++) - pack32(j->j_cap.UserKey.Values[i], buffer); - pack16(j->j_cap.Type, buffer); -#if HAVE_LIBELANCTRL -# ifdef ELAN_CAP_ELAN3 - pack16(j->j_cap.cap_elan_type, buffer); -# else - j->j_cap.cap_spare = ELAN_CAP_UNINITIALISED; - pack16(j->j_cap.cap_spare, buffer); -# endif -#endif -#if HAVE_LIBELAN3 - pack16(j->j_cap.padding, buffer); -#endif - pack32(j->j_cap.Version, buffer); - pack32(j->j_cap.LowContext, buffer); - pack32(j->j_cap.HighContext, buffer); - pack32(j->j_cap.MyContext, buffer); - pack32(j->j_cap.LowNode, buffer); - pack32(j->j_cap.HighNode, buffer); -#if HAVE_LIBELAN3 - pack32(j->j_cap.Entries, buffer); -#endif - pack32(j->j_cap.RailMask, buffer); - for (i = 0; i < ELAN_BITMAPSIZE; i++) - pack32(j->j_cap.Bitmap[i], buffer); - - return (get_buf_offset(buffer) - offset); -} - -/* - * Unpack jobinfo structure packed by qsw_pack_jobinfo. 
- * j (IN/OUT) where to store libstate structure - * buffer (OUT) where to load packed data - * RETURN #bytes unused in 'data' or -1 on error (sets errno) - */ -int -qsw_unpack_jobinfo(qsw_jobinfo_t j, Buf buffer) -{ - int i, offset; - - assert(j->j_magic == QSW_JOBINFO_MAGIC); - offset = get_buf_offset(buffer); - - safe_unpack32(&j->j_magic, buffer); - safe_unpack32(&j->j_prognum, buffer); - for (i = 0; i < 4; i++) - safe_unpack32(&j->j_cap.UserKey.Values[i], buffer); - safe_unpack16(&j->j_cap.Type, buffer); -#if HAVE_LIBELANCTRL -# ifdef ELAN_CAP_ELAN3 - safe_unpack16(&j->j_cap.cap_elan_type, buffer); -# else - safe_unpack16(&j->j_cap.cap_spare, buffer); -# endif -#endif -#if HAVE_LIBELAN3 - safe_unpack16(&j->j_cap.padding, buffer); -#endif - safe_unpack32(&j->j_cap.Version, buffer); - safe_unpack32(&j->j_cap.LowContext, buffer); - safe_unpack32(&j->j_cap.HighContext, buffer); - safe_unpack32(&j->j_cap.MyContext, buffer); - safe_unpack32(&j->j_cap.LowNode, buffer); - safe_unpack32(&j->j_cap.HighNode, buffer); -#if HAVE_LIBELAN3 - safe_unpack32(&j->j_cap.Entries, buffer); -#endif - safe_unpack32(&j->j_cap.RailMask, buffer); - for (i = 0; i < ELAN_BITMAPSIZE; i++) - safe_unpack32(&j->j_cap.Bitmap[i], buffer); - - if (j->j_magic != QSW_JOBINFO_MAGIC) - goto unpack_error; - - return SLURM_SUCCESS; - - unpack_error: - slurm_seterrno_ret(EBADMAGIC_QSWJOBINFO); - return SLURM_ERROR; -} - -/* - * Allocate a program description number. Program descriptions, which are the - * key abstraction maintained by the rms.o kernel module, must not be used - * more than once simultaneously on a single node. We allocate one to each - * parallel job which more than meets this requirement. A program description - * can be compared to a process group, except there is no way for a process to - * disassociate itself or its children from the program description. 
- * If the library is initialized, we allocate these consecutively, otherwise - * we generate a random one, assuming we are being called by a transient - * program like pdsh. Ref: rms_prgcreate(3). - */ -static int -_generate_prognum(void) -{ - int new; - - if (qsw_internal_state) { - _lock_qsw(); - new = qsw_internal_state->ls_prognum; - if (new == QSW_PRG_END) - qsw_internal_state->ls_prognum = QSW_PRG_START; - else - qsw_internal_state->ls_prognum++; - _unlock_qsw(); - } else { - _srand_if_needed(); - new = lrand48() % (QSW_PRG_END - QSW_PRG_START + 1); - new += QSW_PRG_START; - } - return new; -} - -/* - * Elan hardware context numbers are an adapter resource that must not be used - * more than once on a single node. One is allocated to each process on the - * node that will be communication over Elan. In order for processes on the - * same node to communicate with one another and with other nodes across QsNet, - * they must use contexts in the hi-lo range of a common capability. - * If the library is initialized, we allocate these consecutively, otherwise - * we generate a random one, assuming we are being called by a transient - * program like pdsh. Ref: rms_setcap(3). - */ -static int -_generate_hwcontext(int num) -{ - int new; - - if (qsw_internal_state) { - _lock_qsw(); - if (qsw_internal_state->ls_hwcontext + num - 1 > QSW_CTX_END) - qsw_internal_state->ls_hwcontext = QSW_CTX_START; - new = qsw_internal_state->ls_hwcontext; - qsw_internal_state->ls_hwcontext += num; - _unlock_qsw(); - } else { - _srand_if_needed(); - new = lrand48() % - (QSW_CTX_END - (QSW_CTX_START + num - 1) - 1); - new += QSW_CTX_START; - } - return new; -} - -/* - * Initialize the elan capability for this job. 
- */ -static void -_init_elan_capability(ELAN_CAPABILITY *cap, int nprocs, int nnodes, - bitstr_t *nodeset, int cyclic_alloc) -{ - int i, node_num, full_node_cnt, min_procs_per_node, max_procs_per_node; - - /* Task count may not be identical for all nodes */ - full_node_cnt = nprocs % nnodes; - min_procs_per_node = nprocs / nnodes; - max_procs_per_node = (nprocs + nnodes - 1) / nnodes; - - _srand_if_needed(); - - /* start with a clean slate */ -#if HAVE_LIBELANCTRL - elan_nullcap(cap); -#else - elan3_nullcap(cap); -#endif - - /* initialize for single rail and either block or cyclic allocation */ - if (cyclic_alloc) - cap->Type = ELAN_CAP_TYPE_CYCLIC; - else - cap->Type = ELAN_CAP_TYPE_BLOCK; - cap->Type |= ELAN_CAP_TYPE_MULTI_RAIL; - cap->RailMask = 1; - -#if HAVE_LIBELANCTRL -# ifdef ELAN_CAP_ELAN3 - cap->cap_elan_type = ELAN_CAP_ELAN3; -# else - cap->cap_spare = ELAN_CAP_UNINITIALISED; -# endif -#endif - - /* UserKey is 128 bits of randomness which should be kept private */ - for (i = 0; i < 4; i++) - cap->UserKey.Values[i] = lrand48(); - - /* set up hardware context range */ - cap->LowContext = _generate_hwcontext(max_procs_per_node); - cap->HighContext = cap->LowContext + max_procs_per_node - 1; - /* Note: not necessary to initialize cap->MyContext */ - - /* set the range of nodes to be used and number of processes */ - cap->LowNode = bit_ffs(nodeset); - assert(cap->LowNode != -1); - cap->HighNode = bit_fls(nodeset); - assert(cap->HighNode != -1); - -#if HAVE_LIBELAN3 - cap->Entries = nprocs; -#endif - -#if USE_OLD_LIBELAN - /* set the hw broadcast bit if consecutive nodes */ - if (abs(cap->HighNode - cap->LowNode) == nnodes - 1) - cap->Type |= ELAN_CAP_TYPE_BROADCASTABLE; -#else - /* set unconditionally per qsw gnat sw-elan/4334 */ - /* only time we don't want this is unsupported rev A hardware */ - cap->Type |= ELAN_CAP_TYPE_BROADCASTABLE; -#endif - /* - * Set up cap->Bitmap, which describes the mapping of processes to - * the nodes in the range of 
cap->LowNode - cap->Highnode. - * There are (nprocs * nnodes) significant bits in the mask, each - * representing a process slot. Bits are off for process slots - * corresponding to unallocated nodes. For example, if nodes 4 and 6 - * are running two processes per node, bits 0,1 (corresponding to the - * two processes on node 4) and bits 4,5 (corresponding to the two - * processes running on node 6) are set. - */ - node_num = 0; - for (i = cap->LowNode; i <= cap->HighNode; i++) { - if (bit_test(nodeset, i)) { - int j, bit, task_cnt; - - if (node_num++ < full_node_cnt) - task_cnt = max_procs_per_node; - else - task_cnt = min_procs_per_node; - - for (j = 0; j < task_cnt; j++) { - if (cyclic_alloc) - bit = (i-cap->LowNode) + ( j * - (cap->HighNode - cap->LowNode + 1)); - else - bit = ((i-cap->LowNode) - * max_procs_per_node) + j; - - assert(bit < (sizeof(cap->Bitmap) * 8)); - BT_SET(cap->Bitmap, bit); - } - } - } -} - -/* - * Create all the QsNet related information needed to set up a QsNet parallel - * program and store it in the qsw_jobinfo struct. - * Call this on the "client" process, e.g. pdsh, srun, slurmctld, etc.. - */ -int -qsw_setup_jobinfo(qsw_jobinfo_t j, int nprocs, bitstr_t *nodeset, - int cyclic_alloc) -{ - int nnodes = bit_set_count(nodeset); - - assert(j != NULL); - assert(j->j_magic == QSW_JOBINFO_MAGIC); - - /* sanity check on args */ - /* Note: ELAN_MAX_VPS is 512 on "old" Elan driver, 16384 on new. 
*/ - if ((nprocs <= 0) || (nprocs > ELAN_MAX_VPS) || (nnodes <= 0)) { - slurm_seterrno_ret(EINVAL); - } - - /* initialize jobinfo */ - j->j_prognum = _generate_prognum(); - _init_elan_capability(&j->j_cap, nprocs, nnodes, nodeset, - cyclic_alloc); - - return 0; -} - -/* - * Here are the necessary steps to set up to run an Elan MPI parallel program - * (set of processes) on a node (possibly one of many allocated to the prog): - * - * Process 1 Process 2 | Process 3 - * read args | - * fork ------- rms_prgcreate | - * waitpid elan3_create | - * rms_prgaddcap | - * fork N procs ---+------ rms_setcap - * wait all | setup RMS_ env - * | setuid, etc. - * | exec mpi process - * | - * exit | - * rms_prgdestroy | - * exit | (one pair of processes per mpi proc!) - * - * - The first fork is required because rms_prgdestroy can't occur in the - * process that calls rms_prgcreate (since it is a member, ECHILD). - * - The second fork is required when running multiple processes per node - * because each process must announce its use of one of the hw contexts - * in the range allocated in the capability. - */ - -/* - * Process 1: issue the rms_prgdestroy for the job. - */ -int -qsw_prgdestroy(qsw_jobinfo_t jobinfo) -{ - if (rms_prgdestroy(jobinfo->j_prognum) < 0) { - /* translate errno values to more descriptive ones */ - switch (errno) { - case ECHILD: - slurm_seterrno(ECHILD_PRGDESTROY); - break; - case EEXIST: - slurm_seterrno(EEXIST_PRGDESTROY); - break; - default: - break; - } - return -1; - } - return 0; -} - -/* - * Process 2: Destroy the context after children are dead. - */ -void -qsw_prog_fini(qsw_jobinfo_t jobinfo) -{ - /* Do nothing... apparently this will be handled by - * callbacks in the kernel exit handlers ... - */ -#if 0 - if (jobinfo->j_ctx) { - elan3_control_close(jobinfo->j_ctx); - jobinfo->j_ctx = NULL; - } -#endif -} - -/* - * Process 2: Create the context and make capability available to children. 
- */ -int -qsw_prog_init(qsw_jobinfo_t jobinfo, uid_t uid) -{ - int err; - int i, nrails; -#if HAVE_LIBELANCTRL - nrails = elan_nrails(&jobinfo->j_cap); - - for (i = 0; i < nrails; i++) { - ELANCTRL_HANDLE handle; - /* - * Open up the Elan control device so we can create - * a new capability. - */ - if (elanctrl_open(&handle) != 0) { - slurm_seterrno(EELAN3CONTROL); - goto fail; - } - - /* Push capability into device driver */ - if (elanctrl_create_cap(handle, &jobinfo->j_cap) < 0) { - error("elanctrl_create_cap: %m"); - slurm_seterrno(EELAN3CREATE); - goto fail; - } - } - -#else /* !HAVE_LIBELANCTRL */ - nrails = elan3_nrails(&jobinfo->j_cap); - - for (i = 0; i < nrails; i++) { - - ELAN3_CTX *ctx; - - /* see qsw gnat sw-elan/4334: elan3_control_open can ret -1 */ - if ((ctx = elan3_control_open(i)) == NULL - || ctx == (void *)-1) { - slurm_seterrno(EELAN3CONTROL); - goto fail; - } - - - /* make cap known via rms_getcap/rms_ncaps to members - * of this prgnum */ - if (elan3_create(ctx, &jobinfo->j_cap) < 0) { - /* XXX masking errno value better than not knowing - * which function failed? 
*/ - error("elan3_create(%d): %m", i); - slurm_seterrno(EELAN3CREATE); - goto fail; - } - } -#endif - /* associate this process and its children with prgnum */ - if (rms_prgcreate(jobinfo->j_prognum, uid, 1) < 0) { - /* translate errno values to more descriptive ones */ - switch (errno) { - case EINVAL: - slurm_seterrno(EINVAL_PRGCREATE); - break; - default: - break; - } - goto fail; - } - - if (rms_prgaddcap(jobinfo->j_prognum, 0, &jobinfo->j_cap) < 0) { - /* translate errno values to more descriptive ones */ - switch (errno) { - case ESRCH: - slurm_seterrno(ESRCH_PRGADDCAP); - break; - case EFAULT: - slurm_seterrno(EFAULT_PRGADDCAP); - break; - default: - break; - } - goto fail; - } - - /* note: _elan3_fini() destroys context and makes capability unavail */ - /* do it in qsw_prog_fini() after app terminates */ - return 0; -fail: - err = errno; /* presrve errno in case _elan3_fini touches it */ - qsw_prog_fini(jobinfo); - slurm_seterrno(err); - return -1; -} - -/* - * Process 3: Do the rms_setcap. - */ -int -qsw_setcap(qsw_jobinfo_t jobinfo, int procnum) -{ - /* - * Assign elan hardware context to current process. - * - arg1 (0 below) is an index into the kernel's list of caps for this - * program desc (added by rms_prgaddcap). There will be - * one per rail. - * - arg2 indexes the hw ctxt range in the capability - * [cap->LowContext, cap->HighContext] - */ - if (rms_setcap(0, procnum) < 0) { - /* translate errno values to more descriptive ones */ - switch (errno) { - case EINVAL: - slurm_seterrno(EINVAL_SETCAP); - break; - case EFAULT: - slurm_seterrno(EFAULT_SETCAP); - break; - default: - break; - } - return -1; - } - return 0; -} - - -/* - * Return the local elan address (for rail 0) or -1 on failure. 
- */ -int -qsw_getnodeid(void) -{ - int nodeid = -1; -#if HAVE_LIBELANCTRL - ELAN_DEV_IDX devidx = 0; - ELANCTRL_HANDLE handle; - ELAN_POSITION position; - - if (elanctrl_open(&handle) != 0) - slurm_seterrno_ret(EGETNODEID); - - if (elanctrl_get_position(handle, devidx, &position) != 0) - slurm_seterrno_ret(EGETNODEID); - - nodeid = position.pos_nodeid; - -#else - ELAN3_CTX *ctx = _elan3_init(0); /* rail 0 */ - if (ctx) { - nodeid = ctx->devinfo.Position.NodeId; - elan3_control_close(ctx); - } -#endif - if (nodeid == -1) - slurm_seterrno(EGETNODEID); - return nodeid; - -} - -static int -_read_elanhost_config (void) -{ - int rc; - - if (!(elanconf = elanhost_config_create ())) - return (-1); - - if ((rc = elanhost_config_read (elanconf, NULL)) < 0) { - error ("Unable to read Elan config: %s", - elanhost_config_err (elanconf)); - elanhost_config_destroy (elanconf); - elanconf = NULL; - return (-1); - } - - return (0); -} - -int -qsw_maxnodeid(void) -{ - int maxid = -1; - - _lock_qsw(); - if (!elanconf && (_read_elanhost_config() < 0)) - goto done; - - maxid = elanhost_config_maxid (elanconf); - - done: - _unlock_qsw(); - return maxid; -} - -/* - * Given a hostname, return the elanid or -1 on error. - * Initializes the elanconfig from the default /etc/elanhosts - * config file. - */ -int -qsw_getnodeid_byhost(char *host) -{ - int id = -1; - - if (host == NULL) - return (-1); - - _lock_qsw(); - if (!elanconf && (_read_elanhost_config() < 0)) - goto done; - - xassert (elanconf != NULL); - - id = elanhost_host2elanid (elanconf, host); - - done: - _unlock_qsw(); - return id; -} - -/* - * Given an elanid, determine the hostname. Returns -1 on error or the number - * of characters copied on success. 
- * XXX - assumes RMS style hostnames (see above) - */ -int -qsw_gethost_bynodeid(char *buf, int len, int id) -{ - int rc = -1; - char *hostp; - - if (id < 0) slurm_seterrno_ret(EGETHOST_BYNODEID); - - _lock_qsw(); - if (!elanconf && (_read_elanhost_config() < 0)) - goto done; - - if (!(hostp = elanhost_elanid2host (elanconf, ELANHOST_EIP, id))) { - slurm_seterrno (EGETHOST_BYNODEID); - goto done; - } - - rc = strlcpy (buf, hostp, len); - - done: - _unlock_qsw(); - return (rc); -} - -/* - * Send the specified signal to all members of a program description. - * Returns -1 on failure and sets errno. Ref: rms_prgsignal(3). - */ -int -qsw_prgsignal(qsw_jobinfo_t jobinfo, int signum) -{ - if (rms_prgsignal(jobinfo->j_prognum, signum) < 0) { - /* translate errno values to more descriptive ones */ - switch (errno) { - case EINVAL: - slurm_seterrno(EINVAL_PRGSIGNAL); - break; - case ESRCH: - slurm_seterrno(ESRCH_PRGSIGNAL); - break; - default: - break; - } - return -1; - } - return 0; -} - -#define _USE_ELAN3_CAPABILITY_STRING 1 - -#ifndef _USE_ELAN3_CAPABILITY_STRING -#define TRUNC_BITMAP 1 -static void -_print_capbitmap(FILE *fp, ELAN_CAPABILITY *cap) -{ - int bit_max = sizeof(cap->Bitmap)*8 - 1; - int bit; -#if TRUNC_BITMAP - bit_max = bit_max >= 64 ? 64 : bit_max; -#endif - for (bit = bit_max; bit >= 0; bit--) - fprintf(fp, "%c", BT_TEST(cap->Bitmap, bit) ? 
'1' : '0'); - fprintf(fp, "\n"); -} -#endif /* !_USE_ELAN3_CAPABILITY_STRING */ - -char * -qsw_capability_string(struct qsw_jobinfo *j, char *buf, size_t size) -{ - ELAN_CAPABILITY *cap; - - assert(buf != NULL); - assert(j->j_magic == QSW_JOBINFO_MAGIC); - - cap = &j->j_cap; - -#if HAVE_LIBELANCTRL - snprintf(buf, size, "prg=%d ctx=%x.%x nodes=%d.%d", - j->j_prognum, cap->LowContext, cap->HighContext, - cap->LowNode, cap->HighNode); -#else - snprintf(buf, size, "prg=%d ctx=%x.%x nodes=%d.%d entries=%d", - j->j_prognum, cap->LowContext, cap->HighContext, - cap->LowNode, cap->HighNode, - cap->Entries); -#endif - - return buf; -} - -void -qsw_print_jobinfo(FILE *fp, struct qsw_jobinfo *jobinfo) -{ - ELAN_CAPABILITY *cap; - char str[8192]; - - assert(jobinfo->j_magic == QSW_JOBINFO_MAGIC); - - fprintf(fp, "__________________\n"); - fprintf(fp, "prognum=%d\n", jobinfo->j_prognum); - - cap = &jobinfo->j_cap; - /* use elan3_capability_string as a shorter alternative for now */ -#if _USE_ELAN3_CAPABILITY_STRING -# if HAVE_LIBELANCTRL - fprintf(fp, "%s\n", elan_capability_string(cap, str)); -# else - fprintf(fp, "%s\n", elan3_capability_string(cap, str)); -# endif -#else - fprintf(fp, "cap.UserKey=%8.8x.%8.8x.%8.8x.%8.8x\n", - cap->UserKey.Values[0], cap->UserKey.Values[1], - cap->UserKey.Values[2], cap->UserKey.Values[3]); - /*fprintf(fp, "cap.Version=%d\n", cap->Version);*/ - fprintf(fp, "cap.Type=0x%hx\n", cap->Type); - fprintf(fp, "cap.LowContext=%d\n", cap->LowContext); - fprintf(fp, "cap.HighContext=%d\n", cap->HighContext); - fprintf(fp, "cap.MyContext=%d\n", cap->MyContext); - fprintf(fp, "cap.LowNode=%d\n", cap->LowNode); - fprintf(fp, "cap.HighNode=%d\n", cap->HighNode); -#if HAVE_LIBELAN3 - fprintf(fp, "cap.padding=%hd\n", cap->padding); - fprintf(fp, "cap.Entries=%d\n", cap->Entries); -#endif - fprintf(fp, "cap.Railmask=0x%x\n", cap->RailMask); - fprintf(fp, "cap.Bitmap="); - _print_capbitmap(fp, cap); -#endif - fprintf(fp, "\n------------------\n"); -} diff 
--git a/src/common/qsw.h b/src/common/qsw.h deleted file mode 100644 index 21f722d5839..00000000000 --- a/src/common/qsw.h +++ /dev/null @@ -1,100 +0,0 @@ -/*****************************************************************************\ - * qsw.h - Library routines for initiating jobs on QsNet. - ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Jim Garlick <garlick@llnl.gov> - * UCRL-CODE-2002-040. - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. -\*****************************************************************************/ - -#include <stdio.h> -#include <sys/types.h> - -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include "src/common/bitstring.h" -#include "src/common/pack.h" - -#ifndef _QSW_INCLUDED -#define _QSW_INCLUDED - -#if HAVE_LIBELANCTRL -# include <elan/capability.h> -#elif HAVE_LIBELAN3 -# include <elan3/elanvp.h> -#else -# error "Don't have either libelanctrl or libelan3!" -#endif - -/* opaque data structures - no peeking! 
*/ -typedef struct qsw_libstate *qsw_libstate_t; -#ifndef __qsw_jobinfo_t_defined -# define __qsw_jobinfo_t_defined - typedef struct qsw_jobinfo *qsw_jobinfo_t; /* opaque data type */ -#endif - -#define QSW_LIBSTATE_PACK_MAX 12 -#define QSW_JOBINFO_PACK_MAX 120 -#define QSW_MAX_TASKS ELAN_MAX_VPS -#define QSW_PACK_SIZE (4 * (2+4+1+8+ELAN_BITMAPSIZE)) - -int qsw_alloc_libstate(qsw_libstate_t *lsp); -void qsw_free_libstate(qsw_libstate_t ls); - -int qsw_pack_libstate(qsw_libstate_t ls, Buf buffer); -int qsw_unpack_libstate(qsw_libstate_t ls, Buf buffer); - -int qsw_init(qsw_libstate_t restorestate); -void qsw_fini(qsw_libstate_t savestate); - -int qsw_alloc_jobinfo(qsw_jobinfo_t *jp); -qsw_jobinfo_t qsw_copy_jobinfo(qsw_jobinfo_t j); -void qsw_free_jobinfo(qsw_jobinfo_t j); - -int qsw_pack_jobinfo(qsw_jobinfo_t j, Buf buffer); -int qsw_unpack_jobinfo(qsw_jobinfo_t j, Buf buffer); - -int qsw_setup_jobinfo(qsw_jobinfo_t j, int nprocs, - bitstr_t *nodeset, int cyclic_alloc); - -int qsw_prog_init(qsw_jobinfo_t jobinfo, uid_t uid); -void qsw_prog_fini(qsw_jobinfo_t jobinfo); - -int qsw_prgdestroy(qsw_jobinfo_t jobinfo); /* was qsw_prog_reap */ - -int qsw_setcap(qsw_jobinfo_t jobinfo, int procnum); - /* was qsw_attach */ - -int qsw_prgsignal(qsw_jobinfo_t jobinfo, int signum); - /* was qsw_signal_job */ - - /* return max ElanID in configuration */ -int qsw_maxnodeid(void); - -int qsw_getnodeid(void); -int qsw_getnodeid_byhost(char *host); -int qsw_gethost_bynodeid(char *host, int len, int elanid); - -char * qsw_capability_string(qsw_jobinfo_t j, char *buf, size_t len); -void qsw_print_jobinfo(FILE *fp, struct qsw_jobinfo *jobinfo); - -#endif /* _QSW_INCLUDED */ diff --git a/src/slurmd/setenvpf.c b/src/common/setenvpf.c similarity index 98% rename from src/slurmd/setenvpf.c rename to src/common/setenvpf.c index de4ea20916d..ccbea5672a3 100644 --- a/src/slurmd/setenvpf.c +++ b/src/common/setenvpf.c @@ -1,5 +1,5 @@ 
/*****************************************************************************\ - * src/slurmd/setenvpf.c - add an environment variable to environment vector + * src/common/setenvpf.c - add an environment variable to environment vector * $Id$ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. diff --git a/src/slurmd/setenvpf.h b/src/common/setenvpf.h similarity index 96% rename from src/slurmd/setenvpf.h rename to src/common/setenvpf.h index c8667d936d7..8a9060a30c3 100644 --- a/src/slurmd/setenvpf.h +++ b/src/common/setenvpf.h @@ -1,5 +1,5 @@ /*****************************************************************************\ - * src/slurmd/setenvpf.h - environment vector manipulation + * src/common/setenvpf.h - environment vector manipulation ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 7f5a3e82e36..0b61fcee0fe 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -39,6 +39,7 @@ #include "src/common/log.h" #include "src/common/slurm_cred.h" #include "src/common/slurm_protocol_defs.h" +#include "src/common/switch.h" #include "src/common/xmalloc.h" static void _free_all_job_info (job_info_msg_t *msg); @@ -282,9 +283,8 @@ void slurm_free_launch_tasks_request_msg(launch_tasks_request_msg_t * msg) xfree(msg->ofname); xfree(msg->ofname); -# ifdef HAVE_ELAN - qsw_free_jobinfo(msg->qsw_job); -# endif + if (msg->switch_job) + switch_free_jobinfo(msg->switch_job); xfree(msg); } @@ -473,10 +473,8 @@ void slurm_free_resource_allocation_and_run_response_msg ( xfree(msg->cpu_count_reps); xfree(msg->node_addr); slurm_cred_destroy(msg->cred); -# ifdef HAVE_LIBELAN3 - if (msg->qsw_job) - qsw_free_jobinfo(msg->qsw_job); -# endif + if (msg->switch_job) + switch_free_jobinfo(msg->switch_job); xfree(msg); } } @@ -494,10 +492,8 @@ void slurm_free_job_step_create_response_msg( if (msg) { slurm_cred_destroy(msg->cred); -# ifdef HAVE_LIBELAN3 - if (msg->qsw_job) - qsw_free_jobinfo(msg->qsw_job); -# endif + if (msg->switch_job) + switch_free_jobinfo(msg->switch_job); xfree(msg); } @@ -546,7 +542,6 @@ void slurm_free_ctl_conf(slurm_ctl_conf_info_msg_t * config_ptr) xfree(config_ptr->slurmd_spooldir); xfree(config_ptr->slurm_conf); xfree(config_ptr->state_save_location); - xfree(config_ptr->switch_type); xfree(config_ptr->tmp_fs); xfree(config_ptr); } diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index b4ae235cb5a..2b47baec4bc 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -36,9 +36,6 @@ # include <stdint.h> # endif # endif /* HAVE_INTTYPES_H */ -# if HAVE_ELAN -# include <src/common/qsw.h> -# endif #else /* !HAVE_CONFIG_H */ # include <inttypes.h> #endif /* HAVE_CONFIG_H */ @@ 
-47,6 +44,7 @@ #include "src/common/macros.h" #include "src/common/slurm_protocol_common.h" +#include "src/common/switch.h" #include "src/common/xassert.h" @@ -258,10 +256,7 @@ typedef struct launch_tasks_request_msg { int32_t slurmd_debug; /* remote slurmd debug level */ slurm_cred_t cred; /* job credential */ - -#ifdef HAVE_ELAN - qsw_jobinfo_t qsw_job; /* Elan3 switch context */ -#endif + switch_jobinfo_t switch_job; /* switch credential for the job */ } launch_tasks_request_msg_t; typedef struct launch_tasks_response_msg { diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 4cd95d0e354..468174eb1f6 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -28,9 +28,9 @@ # include "config.h" #endif +#include <errno.h> #include <stdio.h> #include <stdlib.h> -#include <errno.h> #include <string.h> #include "src/common/bitstring.h" @@ -41,12 +41,9 @@ #include "src/common/slurm_protocol_api.h" #include "src/common/slurm_protocol_defs.h" #include "src/common/slurm_protocol_pack.h" +#include "src/common/switch.h" #include "src/common/xmalloc.h" -#if HAVE_ELAN -# include "src/common/qsw.h" -#endif - #define _pack_job_info_msg(msg,buf) _pack_buffer_msg(msg,buf) #define _pack_job_step_info_msg(msg,buf) _pack_buffer_msg(msg,buf) @@ -890,9 +887,7 @@ static void _pack_slurm_addr_array(msg->node_addr, msg->node_cnt, buffer); slurm_cred_pack(msg->cred, buffer); -#ifdef HAVE_ELAN - qsw_pack_jobinfo(msg->qsw_job, buffer); -#endif + switch_pack_jobinfo(msg->switch_job, buffer); } static int @@ -941,14 +936,12 @@ static int if (!(tmp_ptr->cred = slurm_cred_unpack(buffer))) goto unpack_error; -#ifdef HAVE_ELAN - qsw_alloc_jobinfo(&tmp_ptr->qsw_job); - if (qsw_unpack_jobinfo(tmp_ptr->qsw_job, buffer) < 0) { - error("qsw_unpack_jobinfo: %m"); - qsw_free_jobinfo(tmp_ptr->qsw_job); + switch_alloc_jobinfo(&tmp_ptr->switch_job); + if (switch_unpack_jobinfo(tmp_ptr->switch_job, buffer) < 0) { + error("switch_unpack_jobinfo: 
%m"); + switch_free_jobinfo(tmp_ptr->switch_job); goto unpack_error; } -#endif return SLURM_SUCCESS; unpack_error: @@ -1286,9 +1279,7 @@ _pack_job_step_create_response_msg(job_step_create_response_msg_t * msg, pack32(msg->job_step_id, buffer); packstr(msg->node_list, buffer); slurm_cred_pack(msg->cred, buffer); -#ifdef HAVE_ELAN - qsw_pack_jobinfo(msg->qsw_job, buffer); -#endif + switch_pack_jobinfo(msg->switch_job, buffer); } @@ -1309,14 +1300,12 @@ _unpack_job_step_create_response_msg(job_step_create_response_msg_t ** msg, if (!(tmp_ptr->cred = slurm_cred_unpack(buffer))) goto unpack_error; -#ifdef HAVE_ELAN - qsw_alloc_jobinfo(&tmp_ptr->qsw_job); - if (qsw_unpack_jobinfo(tmp_ptr->qsw_job, buffer)) { - error("qsw_unpack_jobinfo: %m"); - qsw_free_jobinfo(tmp_ptr->qsw_job); + switch_alloc_jobinfo(&tmp_ptr->switch_job); + if (switch_unpack_jobinfo(tmp_ptr->switch_job, buffer)) { + error("switch_unpack_jobinfo: %m"); + switch_free_jobinfo(tmp_ptr->switch_job); goto unpack_error; } -#endif return SLURM_SUCCESS; unpack_error: @@ -2139,9 +2128,7 @@ _pack_launch_tasks_request_msg(launch_tasks_request_msg_t * msg, Buf buffer) pack32(msg->slurmd_debug, buffer); pack32_array(msg->global_task_ids, msg->tasks_to_launch, buffer); -#ifdef HAVE_ELAN - qsw_pack_jobinfo(msg->qsw_job, buffer); -#endif + switch_pack_jobinfo(msg->switch_job, buffer); } static int @@ -2179,13 +2166,13 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t ** if (msg->tasks_to_launch != uint32_tmp) goto unpack_error; -#ifdef HAVE_ELAN - qsw_alloc_jobinfo(&msg->qsw_job); - if (qsw_unpack_jobinfo(msg->qsw_job, buffer) < 0) { - error("qsw_unpack_jobinfo: %m"); + switch_alloc_jobinfo(&msg->switch_job); + if (switch_unpack_jobinfo(msg->switch_job, buffer) < 0) { + error("switch_unpack_jobinfo: %m"); + switch_free_jobinfo(msg->switch_job); goto unpack_error; } -#endif + return SLURM_SUCCESS; unpack_error: diff --git a/src/common/switch.c b/src/common/switch.c new file mode 100644 index 
00000000000..f2499251d71 --- /dev/null +++ b/src/common/switch.c @@ -0,0 +1,390 @@ +/*****************************************************************************\ + * src/common/switch.c - Generic switch (interconnect) for slurm + ***************************************************************************** + * Copyright (C) 2002 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Moe Jette <jette@llnl.gov>. + * UCRL-CODE-2002-040. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ + +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "src/common/macros.h" +#include "src/common/plugin.h" +#include "src/common/plugrack.h" +#include "src/common/slurm_protocol_api.h" +#include "src/common/switch.h" +#include "src/common/xmalloc.h" +#include "src/common/xstring.h" + +/* + * WARNING: Do not change the order of these fields or add additional + * fields at the beginning of the structure. If you do, job completion + * logging plugins will stop working. 
If you need to add fields, add them + * at the end of the structure. + */ +typedef struct slurm_switch_ops { + int (*state_save) ( char *dir_name ); + int (*state_restore) ( char *dir_name ); + bool (*no_frag) ( void ); + int (*alloc_jobinfo) ( switch_jobinfo_t *jobinfo ); + int (*build_jobinfo) ( switch_jobinfo_t jobinfo, + char *nodelist, int nprocs, + int cyclic_alloc); + switch_jobinfo_t (*copy_jobinfo) ( switch_jobinfo_t jobinfo ); + void (*free_jobinfo) ( switch_jobinfo_t jobinfo ); + int (*pack_jobinfo) ( switch_jobinfo_t jobinfo, + Buf buffer ); + int (*unpack_jobinfo) ( switch_jobinfo_t jobinfo, + Buf buffer ); + void (*print_jobinfo) ( FILE *fp, + switch_jobinfo_t jobinfo ); + char * (*string_jobinfo) ( switch_jobinfo_t jobinfo, + char *buf, size_t size); + int (*node_init) ( void ); + int (*node_fini) ( void ); + int (*job_preinit) ( switch_jobinfo_t jobinfo ); + int (*job_init) ( switch_jobinfo_t jobinfo, + uid_t uid ); + int (*job_fini) ( switch_jobinfo_t jobinfo ); + int (*job_postfini) ( switch_jobinfo_t jobinfo, + uid_t pgid, + uint32_t job_id, + uint32_t step_id ); + int (*job_attach) ( switch_jobinfo_t jobinfo, + char ***env, int nodeid, + int procid, int nnodes, + int nprocs, gid_t gid); +} slurm_switch_ops_t; + +struct slurm_switch_context { + char * switch_type; + plugrack_t plugin_list; + plugin_handle_t cur_plugin; + int switch_errno; + slurm_switch_ops_t ops; +}; + +static slurm_switch_context_t g_context = NULL; +static pthread_mutex_t context_lock = PTHREAD_MUTEX_INITIALIZER; + +static slurm_switch_context_t +_slurm_switch_context_create( const char *switch_type) +{ + slurm_switch_context_t c; + + if ( switch_type == NULL ) { + debug3( "_slurm_switch_context_create: no switch type" ); + return NULL; + } + + c = xmalloc( sizeof( struct slurm_switch_context ) ); + + c->switch_errno = SLURM_SUCCESS; + + /* Copy the job completion authentication type. 
*/ + c->switch_type = xstrdup( switch_type ); + if (c->switch_type == NULL ) { + debug3( "can't make local copy of switch type" ); + xfree( c ); + return NULL; + } + + /* Plugin rack is demand-loaded on first reference. */ + c->plugin_list = NULL; + c->cur_plugin = PLUGIN_INVALID_HANDLE; + + return c; +} + +static int +_slurm_switch_context_destroy( slurm_switch_context_t c ) +{ + /* + * Must check return code here because plugins might still + * be loaded and active. + */ + if ( c->plugin_list ) { + if ( plugrack_destroy( c->plugin_list ) != SLURM_SUCCESS ) { + return SLURM_ERROR; + } + } + + xfree( c->switch_type ); + xfree( c ); + + return SLURM_SUCCESS; +} + +/* + * Resolve the operations from the plugin. + */ +static slurm_switch_ops_t * +_slurm_switch_get_ops( slurm_switch_context_t c ) +{ + /* + * These strings must be kept in the same order as the fields + * declared for slurm_switch_ops_t. + */ + static const char *syms[] = { + "switch_p_libstate_save", + "switch_p_libstate_restore", + "switch_p_no_frag", + "switch_p_alloc_jobinfo", + "switch_p_build_jobinfo", + "switch_p_copy_jobinfo", + "switch_p_free_jobinfo", + "switch_p_pack_jobinfo", + "switch_p_unpack_jobinfo", + "switch_p_print_jobinfo", + "switch_p_sprint_jobinfo", + "switch_p_node_init", + "switch_p_node_fini", + "switch_p_job_preinit", + "switch_p_job_init", + "switch_p_job_fini", + "switch_p_job_postfini", + "switch_p_job_attach" + }; + int n_syms = sizeof( syms ) / sizeof( char * ); + + /* Get the plugin list, if needed. */ + if ( c->plugin_list == NULL ) { + char *plugin_dir; + c->plugin_list = plugrack_create(); + if ( c->plugin_list == NULL ) { + verbose( "Unable to create a plugin manager" ); + return NULL; + } + + plugrack_set_major_type( c->plugin_list, "switch" ); + plugrack_set_paranoia( c->plugin_list, + PLUGRACK_PARANOIA_NONE, + 0 ); + plugin_dir = slurm_get_plugin_dir(); + plugrack_read_dir( c->plugin_list, plugin_dir ); + xfree(plugin_dir); + } + + /* Find the correct plugin. 
*/ + c->cur_plugin = + plugrack_use_by_type( c->plugin_list, c->switch_type ); + if ( c->cur_plugin == PLUGIN_INVALID_HANDLE ) { + verbose( "can't find a plugin for type %s", c->switch_type ); + return NULL; + } + + /* Dereference the API. */ + if ( plugin_get_syms( c->cur_plugin, + n_syms, + syms, + (void **) &c->ops ) < n_syms ) { + verbose( "incomplete switch plugin detected" ); + return NULL; + } + + return &c->ops; +} + +extern int switch_init( void ) +{ + int retval = SLURM_SUCCESS; + char *switch_type = NULL; + + slurm_mutex_lock( &context_lock ); + + if ( g_context ) + goto done; + + switch_type = slurm_get_switch_type(); + g_context = _slurm_switch_context_create( switch_type ); + if ( g_context == NULL ) { + error( "cannot create a context for %s", switch_type ); + retval = SLURM_ERROR; + goto done; + } + + if ( _slurm_switch_get_ops( g_context ) == NULL ) { + error( "cannot resolve plugin operations for %s", switch_type ); + _slurm_switch_context_destroy( g_context ); + g_context = NULL; + retval = SLURM_ERROR; + } + + done: + slurm_mutex_unlock( &context_lock ); + xfree(switch_type); + return retval; +} + +extern int switch_save(char *dir_name) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.state_save))( dir_name ); +} + +extern int switch_restore(char *dir_name) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.state_restore))( dir_name ); +} + +extern bool switch_no_frag(void) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.no_frag))( ); +} + +extern int switch_alloc_jobinfo(switch_jobinfo_t *jobinfo) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.alloc_jobinfo))( jobinfo ); +} + +extern int switch_build_jobinfo(switch_jobinfo_t jobinfo, + char *nodelist, int nprocs, int cyclic_alloc) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.build_jobinfo))( jobinfo, nodelist, + nprocs, cyclic_alloc 
); +} + +extern switch_jobinfo_t switch_copy_jobinfo(switch_jobinfo_t jobinfo) +{ + if ( switch_init() < 0 ) + return NULL; + + return (*(g_context->ops.copy_jobinfo))( jobinfo ); +} + +extern void switch_free_jobinfo(switch_jobinfo_t jobinfo) +{ + if ( switch_init() < 0 ) + return; + + (*(g_context->ops.free_jobinfo))( jobinfo ); +} + +extern int switch_pack_jobinfo(switch_jobinfo_t jobinfo, Buf buffer) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.pack_jobinfo))( jobinfo, buffer ); +} + +extern int switch_unpack_jobinfo(switch_jobinfo_t jobinfo, Buf buffer) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.unpack_jobinfo))( jobinfo, buffer ); +} + +extern void switch_print_jobinfo(FILE *fp, switch_jobinfo_t jobinfo) +{ + if ( switch_init() < 0 ) + return; + + (*(g_context->ops.print_jobinfo)) (fp, jobinfo); +} + +extern char *switch_sprint_jobinfo( switch_jobinfo_t jobinfo, + char *buf, size_t size) +{ + if ( switch_init() < 0 ) + return NULL; + + return (*(g_context->ops.string_jobinfo)) (jobinfo, buf, size); +} + +extern int interconnect_node_init(void) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.node_init)) (); +} + +extern int interconnect_node_fini(void) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.node_fini)) (); +} + +extern int interconnect_preinit(switch_jobinfo_t jobinfo) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.job_preinit)) (jobinfo); +} + +extern int interconnect_init(switch_jobinfo_t jobinfo, uid_t uid) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.job_init)) (jobinfo, uid); +} + +extern int interconnect_fini(switch_jobinfo_t jobinfo) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.job_fini)) (jobinfo); +} + +extern int interconnect_postfini(switch_jobinfo_t jobinfo, uid_t pgid, + uint32_t job_id, uint32_t 
step_id ) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.job_postfini)) (jobinfo, pgid, + job_id, step_id); +} + +extern int interconnect_attach(switch_jobinfo_t jobinfo, char ***env, + int nodeid, int procid, int nnodes, int nprocs, + gid_t gid) +{ + if ( switch_init() < 0 ) + return SLURM_ERROR; + + return (*(g_context->ops.job_attach)) (jobinfo, env, + nodeid, procid, nnodes, nprocs, gid); +} diff --git a/src/common/switch.h b/src/common/switch.h new file mode 100644 index 00000000000..b70047402cd --- /dev/null +++ b/src/common/switch.h @@ -0,0 +1,228 @@ +/*****************************************************************************\ + * src/common/switch.h - Generic switch (interconnect) info for slurm + ***************************************************************************** + * Copyright (C) 2002 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Moe Jette <jette@llnl.gov>. + * UCRL-CODE-2002-040. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+\*****************************************************************************/ + +#ifndef _SWITCH_H +#define _SWITCH_H 1 + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include <stdio.h> +#include <sys/types.h> +#include <unistd.h> + +#include "src/common/macros.h" +#include "src/common/pack.h" + +/* opaque data structures - no peeking! */ +#ifndef __switch_jobinfo_t_defined +# define __switch_jobinfo_t_defined + typedef struct switch_jobinfo *switch_jobinfo_t; +#endif +typedef struct slurm_switch_context * slurm_switch_context_t; + +/*****************************************\ + * GLOBAL SWITCH STATE MANAGEMENT FUNCTIONS * +\ *****************************************/ + +/* initialize the switch plugin */ +extern int switch_init (void); + +/* save any global switch state to a file within the specified directory + * the actual file name used is plugin specific + * IN dir_name - directory into which switch state is saved + * RET - slurm error code + */ +extern int switch_save (char *dir_name); + +/* restore any global switch state from a file within the specified directory + * the actual file name used is plugin specific + * IN dir_name - directory from which switch state is restored or NULL for + * switch restart with no state restored + * RET - slurm error code + */ +extern int switch_restore(char *dir_name); + +/* report if resource fragmentation is important. if so, delay scheduling a + * new job while another is in the process of terminating.
+ * RET - true if fragmentation is important + */ +extern bool switch_no_frag(void); + +/******************************************************\ + * JOB-SPECIFIC SWITCH CREDENTIAL MANAGEMENT FUNCTIONS * +\******************************************************/ + +/* allocate storage for a switch job credential + * OUT jobinfo - storage for a switch job credential + * RET - slurm error code + * NOTE: storage must be freed using switch_free_jobinfo + */ +extern int switch_alloc_jobinfo (switch_jobinfo_t *jobinfo); + +/* fill a job's switch credential + * OUT jobinfo - storage for a switch job credential + * IN nodelist - list of nodes to be used by the job + * IN nprocs - count of tasks in the job + * IN cyclic_alloc - task distribution pattern, 1=cyclic, 0=block + * NOTE: storage must be freed using switch_free_jobinfo + */ +extern int switch_build_jobinfo (switch_jobinfo_t jobinfo, + char *nodelist, int nprocs, int cyclic_alloc); + +/* copy a switch job credential + * IN jobinfo - the switch job credential to be copied + * RET - the copy + * NOTE: returned value must be freed using switch_free_jobinfo + */ +extern switch_jobinfo_t switch_copy_jobinfo(switch_jobinfo_t jobinfo); + +/* free storage previously allocated for a switch job credential + * IN jobinfo - the switch job credential to be freed + */ +extern void switch_free_jobinfo (switch_jobinfo_t jobinfo); + +/* pack a switch job credential into a buffer in machine independent form + * IN jobinfo - the switch job credential to be saved + * OUT buffer - buffer with switch credential appended + * RET - slurm error code + */ +extern int switch_pack_jobinfo (switch_jobinfo_t jobinfo, Buf buffer); + +/* unpack a switch job credential from a buffer + * OUT jobinfo - the switch job credential read + * IN buffer - buffer with switch credential read from current pointer loc + * RET - slurm error code + * NOTE: returned value must be freed using switch_free_jobinfo + */ +extern int
switch_unpack_jobinfo(switch_jobinfo_t jobinfo, Buf buffer); + +/* write job credential string representation to a file + * IN fp - an open file pointer + * IN jobinfo - a switch job credential + */ +extern void switch_print_jobinfo(FILE *fp, switch_jobinfo_t jobinfo); + +/* write job credential to a string + * IN jobinfo - a switch job credential + * OUT buf - location to write job credential contents + * IN size - byte size of buf + * RET - the string, same as buf + */ +extern char *switch_sprint_jobinfo( switch_jobinfo_t jobinfo, + char *buf, size_t size); +/********************************************************************\ + * JOB LAUNCH AND MANAGEMENT FUNCTIONS RELATED TO SWITCH CREDENTIAL * +\********************************************************************/ + +/* + * Setup node for interconnect use. + * + * This function is run from the top level slurmd only once per + * slurmd run. It may be used, for instance, to perform some one-time + * interconnect setup or spawn an error handling thread. + * + */ +extern int interconnect_node_init(void); + +/* + * Finalize interconnect on node. + * + * This function is called once as slurmd exits (slurmd will wait for + * this function to return before continuing the exit process) + */ +extern int interconnect_node_fini(void); + + +/* + * Notes on job related interconnect functions: + * + * Interconnect functions are run within slurmd in the following way: + * (Diagram courtesy of Jim Garlick [see qsw.c] ) + * + * Process 1 (root) Process 2 (root, user) | Process 3 (user task) + * | + * interconnect_preinit | + * fork ------------------ interconnect_init | + * waitpid setuid, chdir, etc. | + * fork N procs -----------+--- interconnect_attach + * wait all | exec mpi process + * interconnect_fini* | + * interconnect_postfini | + * | + * + * [ *Note: interconnect_fini() is run as the uid of the job owner, not root ] + */ + +/* + * Prepare node for job. 
+ * + * pre is run as root in the first slurmd process, the so called job + * manager. This function can be used to perform any initialization + * that needs to be performed in the same process as interconnect_fini() + * + */ +extern int interconnect_preinit(switch_jobinfo_t jobinfo); + +/* + * initialize interconnect on node for job. This function is run from the + * 2nd slurmd process (some interconnect implementations may require + * interconnect init functions to be executed from a separate process + * than the process executing interconnect_fini() [e.g. QsNet]) + * + */ +extern int interconnect_init(switch_jobinfo_t jobinfo, uid_t uid); + +/* + * This function is run from the same process as interconnect_init() + * after all job tasks have exited. It is *not* run as root, because + * the process in question has already setuid to the job owner. + * + */ +extern int interconnect_fini(switch_jobinfo_t jobinfo); + +/* + * Finalize interconnect on node. + * + * This function is run from the initial slurmd process (same process + * as interconnect_preinit()), and is run as root. Any cleanup routines + * that need to be run with root privileges should be run from this + * function. 
+ */ +extern int interconnect_postfini(switch_jobinfo_t jobinfo, uid_t pgid, + uint32_t job_id, uint32_t step_id ); + +/* + * attach process to interconnect + * (Called from within the process, so it is appropriate to set + * interconnect specific environment variables here) + */ +extern int interconnect_attach(switch_jobinfo_t jobinfo, char ***env, + int nodeid, int procid, int nnodes, int nprocs, + gid_t gid); + +#endif /* _SWITCH_H */ diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am index 6841e92f357..b1f24f9f6b0 100644 --- a/src/plugins/Makefile.am +++ b/src/plugins/Makefile.am @@ -1,3 +1,3 @@ # $Id$ -SUBDIRS = auth jobcomp sched +SUBDIRS = auth jobcomp sched switch diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index df07f580ea6..bb624ffebea 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -56,13 +56,10 @@ #include "src/common/slurm_auth.h" #include "src/common/slurm_jobcomp.h" #include "src/common/slurm_protocol_api.h" +#include "src/common/switch.h" #include "src/common/xsignal.h" #include "src/common/xstring.h" -#if HAVE_ELAN -# include "src/common/qsw.h" -#endif - #include "src/slurmctld/agent.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/ping_nodes.h" diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 58cc688244d..33dbf7329a7 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -41,15 +41,12 @@ #include <string.h> #include <sys/stat.h> -#ifdef HAVE_ELAN -# include "src/common/qsw.h" -#endif - #include <slurm/slurm_errno.h> #include "src/common/bitstring.h" #include "src/common/hostlist.h" #include "src/common/slurm_jobcomp.h" +#include "src/common/switch.h" #include "src/common/xassert.h" #include "src/common/xstring.h" #include "src/slurmctld/agent.h" @@ -707,9 +704,7 @@ static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer) pack_time(step_ptr->start_time, buffer); packstr(step_ptr->host, buffer); 
packstr(step_ptr->step_node_list, buffer); -#ifdef HAVE_ELAN - qsw_pack_jobinfo(step_ptr->qsw_job, buffer); -#endif + switch_pack_jobinfo(step_ptr->switch_job, buffer); } /* Unpack job step state information from a buffer */ @@ -756,13 +751,11 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer) step_ptr->step_node_list = step_node_list; step_node_list = NULL; /* re-used, nothing left to free */ step_ptr->time_last_active = time(NULL); -#ifdef HAVE_ELAN - qsw_alloc_jobinfo(&step_ptr->qsw_job); - if (qsw_unpack_jobinfo(step_ptr->qsw_job, buffer)) { - qsw_free_jobinfo(step_ptr->qsw_job); + switch_alloc_jobinfo(&step_ptr->switch_job); + if (switch_unpack_jobinfo(step_ptr->switch_job, buffer)) { + switch_free_jobinfo(step_ptr->switch_job); goto unpack_error; } -#endif info("recovered job step %u.%u", job_ptr->job_id, step_id); return SLURM_SUCCESS; @@ -1174,11 +1167,9 @@ int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id, new_job_id); top_prio = _top_priority(job_ptr); -#ifdef HAVE_ELAN /* Avoid resource fragmentation if important */ - if (top_prio && job_is_completing()) + if (switch_no_frag() && top_prio && job_is_completing()) top_prio = false; /* Don't scheduled job right now */ -#endif if (immediate && (!top_prio)) { job_ptr->job_state = JOB_FAILED; job_ptr->start_time = 0; diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 3ffbef73ca1..d794b02b057 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -146,13 +146,11 @@ int schedule(void) { NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK }; lock_slurmctld(job_write_lock); -#ifdef HAVE_ELAN /* Avoid resource fragmentation if important */ - if (job_is_completing()) { + if (switch_no_frag() && job_is_completing()) { unlock_slurmctld(job_write_lock); return SLURM_SUCCESS; } -#endif debug("Running job scheduler"); job_queue_size = _build_job_queue(&job_queue); if (job_queue_size == 0) { diff --git a/src/slurmctld/node_mgr.c 
b/src/slurmctld/node_mgr.c index 7922451a786..c0c5c6e4ac9 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1155,16 +1155,15 @@ validate_node_specs (char *node_name, uint32_t cpus, node_ptr->cpus = cpus; node_ptr->real_memory = real_memory; node_ptr->tmp_disk = tmp_disk; -#ifdef HAVE_ELAN /* Every node in a given partition must have the same * processor count at present */ - if ((slurmctld_conf.fast_schedule == 0) && - (node_ptr->config_ptr->cpus != cpus)) { + if ((slurmctld_conf.fast_schedule == 0) && + (node_ptr->config_ptr->cpus != cpus) && + (strcmp(slurmctld_conf.switch_type, "switch/elan") == 0)) { error ("Node %s processor count inconsistent with rest " "of partition", node_name); return EINVAL; /* leave node down */ } -#endif if (node_ptr->node_state == NODE_STATE_UNKNOWN) { last_node_update = time (NULL); reset_job_priority(); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 8c8330715ec..707c9d9a185 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -50,12 +50,9 @@ #include "src/common/slurm_auth.h" #include "src/common/slurm_cred.h" #include "src/common/slurm_protocol_api.h" +#include "src/common/switch.h" #include "src/common/xstring.h" -#if HAVE_ELAN -# include "src/common/qsw.h" -#endif - #include "src/slurmctld/locks.h" #include "src/slurmctld/proc_req.h" #include "src/slurmctld/read_config.h" @@ -275,8 +272,8 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->slurmd_spooldir = slurmctld_conf.slurmd_spooldir; conf_ptr->slurmd_timeout = slurmctld_conf.slurmd_timeout; conf_ptr->slurm_conf = slurmctld_conf.slurm_conf; - conf_ptr->switch_type = slurmctld_conf.switch_type; conf_ptr->state_save_location = slurmctld_conf.state_save_location; + conf_ptr->switch_type = slurmctld_conf.switch_type; conf_ptr->tmp_fs = slurmctld_conf.tmp_fs; conf_ptr->wait_time = slurmctld_conf.wait_time; return; @@ -510,9 +507,8 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg) 
alloc_msg.node_cnt = node_cnt; alloc_msg.node_addr = node_addr; alloc_msg.cred = slurm_cred; -#ifdef HAVE_ELAN - alloc_msg.qsw_job = qsw_copy_jobinfo(step_rec->qsw_job); -#endif + alloc_msg.switch_job = switch_copy_jobinfo( + step_rec->switch_job); unlock_slurmctld(job_write_lock); response_msg.msg_type = RESPONSE_ALLOCATION_AND_RUN_JOB_STEP; response_msg.data = &alloc_msg; @@ -520,9 +516,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg) if (slurm_send_node_msg(msg->conn_fd, &response_msg) < 0) _kill_job_on_msg_fail(job_id); slurm_cred_destroy(slurm_cred); -#ifdef HAVE_ELAN - qsw_free_jobinfo(alloc_msg.qsw_job); -#endif + switch_free_jobinfo(alloc_msg.switch_job); (void) dump_all_job_state(); /* Has its own locks */ } } @@ -952,10 +946,8 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg) job_step_resp.job_step_id = step_rec->step_id; job_step_resp.node_list = xstrdup(step_rec->step_node_list); job_step_resp.cred = slurm_cred; - -#ifdef HAVE_ELAN - job_step_resp.qsw_job = qsw_copy_jobinfo(step_rec->qsw_job); -#endif + job_step_resp.switch_job = switch_copy_jobinfo( + step_rec->switch_job); unlock_slurmctld(job_write_lock); resp.address = msg->address; resp.msg_type = RESPONSE_JOB_STEP_CREATE; @@ -964,9 +956,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg) slurm_send_node_msg(msg->conn_fd, &resp); xfree(job_step_resp.node_list); slurm_cred_destroy(slurm_cred); -#ifdef HAVE_ELAN - qsw_free_jobinfo(job_step_resp.qsw_job); -#endif + switch_free_jobinfo(job_step_resp.switch_job); (void) dump_all_job_state(); /* Sets own locks */ } } @@ -1353,8 +1343,8 @@ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg) /* save_all_state(); performed by _slurmctld_background */ } slurm_send_rc_msg(msg, error_code); - if ((error_code == SLURM_SUCCESS) && core_arg && - (slurmctld_config.thread_id_sig)) + if ((error_code == SLURM_SUCCESS) && core_arg && + (slurmctld_config.thread_id_sig)) pthread_kill(slurmctld_config.thread_id_sig, 
SIGABRT); } diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 0c6f8b3393b..1092a415fca 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -41,11 +41,12 @@ #include <unistd.h> #include "src/common/hostlist.h" -#include "src/common/slurm_jobcomp.h" #include "src/common/list.h" #include "src/common/macros.h" #include "src/common/parse_spec.h" #include "src/common/read_config.h" +#include "src/common/slurm_jobcomp.h" +#include "src/common/switch.h" #include "src/common/xstring.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/proc_req.h" @@ -273,7 +274,7 @@ static int _init_all_slurm_conf(void) static int _parse_node_spec(char *in_line) { char *node_addr = NULL, *node_name = NULL, *state = NULL; - char *feature = NULL, *reason = NULL; + char *feature = NULL, *reason = NULL; char *this_node_addr, *this_node_name; int error_code, first, i; int state_val, cpus_val, real_memory_val, tmp_disk_val, weight_val; @@ -767,6 +768,7 @@ int read_slurm_conf(int recover) validate_config(&slurmctld_conf); update_logging(); g_slurm_jobcomp_init(slurmctld_conf.job_comp_loc); + switch_init(); if (default_part_loc == NULL) error("read_slurm_conf: default partition not set."); @@ -1019,61 +1021,10 @@ static void _validate_node_proc_count(void) */ int switch_state_begin(int recover) { - int error_code = SLURM_SUCCESS; -#ifdef HAVE_ELAN - qsw_libstate_t old_state = NULL; - Buf buffer = NULL; - char *qsw_state_file = NULL, *data = NULL; - int state_fd, data_allocated, data_read= 0, data_size = 0; - - if (recover) { - /* Read state from file into buffer */ - qsw_state_file = xstrdup (slurmctld_conf.state_save_location); - xstrcat (qsw_state_file, "/qsw_state"); - state_fd = open (qsw_state_file, O_RDONLY); - if (state_fd >= 0) { - data_allocated = BUF_SIZE; - data = xmalloc(data_allocated); - while ((data_read = - read (state_fd, &data[data_size], - BUF_SIZE)) == BUF_SIZE) { - data_size += data_read; - data_allocated += BUF_SIZE; 
- xrealloc(data, data_allocated); - } - data_size += data_read; - if (data_read < 0) { - error ("Read error on %s, %m", qsw_state_file); - error_code = SLURM_ERROR; - data_size = 0; - } - close (state_fd); - } else - info("No %s file to recover QSW state from", - qsw_state_file); - xfree(qsw_state_file); - - if ((error_code == SLURM_SUCCESS) && data_size) { - if (qsw_alloc_libstate(&old_state)) { - error_code = SLURM_ERROR; - } else { - buffer = create_buf (data, data_size); - if (qsw_unpack_libstate(old_state, buffer) < 0) - error_code = errno; - } - } - if (buffer) - free_buf(buffer); - else if (data) - xfree(data); - - } - if (error_code == SLURM_SUCCESS) - error_code = qsw_init(old_state); - if (old_state) - qsw_free_libstate(old_state); -#endif /* HAVE_ELAN */ - return error_code; + if (recover) + return switch_restore(slurmctld_conf.state_save_location); + else + return switch_restore(NULL); } /* @@ -1082,42 +1033,6 @@ int switch_state_begin(int recover) */ int switch_state_fini(void) { - int error_code = SLURM_SUCCESS; -#ifdef HAVE_ELAN - qsw_libstate_t old_state = NULL; - Buf buffer = NULL; - char *qsw_state_file = NULL; - int state_fd; - - if (qsw_alloc_libstate(&old_state)) - return errno; - qsw_fini(old_state); - buffer = init_buf(1024); - error_code = qsw_pack_libstate(old_state, buffer); - qsw_state_file = xstrdup (slurmctld_conf.state_save_location); - xstrcat (qsw_state_file, "/qsw_state"); - (void) unlink (qsw_state_file); - state_fd = creat (qsw_state_file, 0600); - if (state_fd == 0) { - error ("Can't save state, error creating file %s %m", - qsw_state_file); - error_code = errno; - } - else { - if (write (state_fd, get_buf_data(buffer), - get_buf_offset(buffer)) != - get_buf_offset(buffer)) { - error ("Can't save state, error writing file %s %m", - qsw_state_file); - error_code = errno; - } - close (state_fd); - } - xfree (qsw_state_file); - if (buffer) - free_buf(buffer); - if (old_state) - qsw_free_libstate(old_state); -#endif /* HAVE_ELAN */ - 
return error_code; + return switch_save(slurmctld_conf.state_save_location); } + diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 8bc9e9a7e77..00120d60f97 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -45,10 +45,6 @@ #include <sys/types.h> #include <unistd.h> -#ifdef HAVE_ELAN -# include "src/common/qsw.h" -#endif /* HAVE_ELAN */ - #ifdef WITH_PTHREADS # include <pthread.h> #endif /* WITH_PTHREADS */ @@ -62,6 +58,7 @@ #include "src/common/pack.h" #include "src/common/slurm_cred.h" #include "src/common/slurm_protocol_api.h" +#include "src/common/switch.h" #include "src/common/xmalloc.h" #define FREE_NULL_BITMAP(_X) \ @@ -314,9 +311,7 @@ struct step_record { time_t time_last_active; /* time of last job activity */ uint16_t port; /* port for srun communications */ char *host; /* host for srun communications */ -#ifdef HAVE_ELAN - qsw_jobinfo_t qsw_job; /* Elan3 switch context, opaque */ -#endif + switch_jobinfo_t switch_job; /* switch context, opaque */ }; typedef struct job_step_specs step_specs; diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 310efebc103..304521085cf 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -37,13 +37,10 @@ #include <string.h> #include <unistd.h> -#ifdef HAVE_ELAN -# include "src/common/qsw.h" -#endif - #include <slurm/slurm_errno.h> #include "src/common/bitstring.h" +#include "src/common/switch.h" #include "src/common/xstring.h" #include "src/slurmctld/agent.h" #include "src/slurmctld/locks.h" @@ -97,9 +94,7 @@ delete_all_step_records (struct job_record *job_ptr) last_job_update = time(NULL); while ((step_ptr = (struct step_record *) list_next (step_iterator))) { list_remove (step_iterator); -#ifdef HAVE_ELAN - qsw_free_jobinfo (step_ptr->qsw_job); -#endif + switch_free_jobinfo(step_ptr->switch_job); xfree(step_ptr->host); xfree(step_ptr->step_node_list); FREE_NULL_BITMAP(step_ptr->step_node_bitmap); @@ -132,9 +127,7 @@ delete_step_record 
(struct job_record *job_ptr, uint32_t step_id) while ((step_ptr = (struct step_record *) list_next (step_iterator))) { if (step_ptr->step_id == step_id) { list_remove (step_iterator); -#ifdef HAVE_ELAN - qsw_free_jobinfo (step_ptr->qsw_job); -#endif + switch_free_jobinfo (step_ptr->switch_job); xfree(step_ptr->host); xfree(step_ptr->step_node_list); FREE_NULL_BITMAP(step_ptr->step_node_bitmap); @@ -495,10 +488,6 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, bitstr_t *nodeset; int node_count; time_t now = time(NULL); -#ifdef HAVE_ELAN - int first, last, i, node_id; - int node_set_size = QSW_MAX_TASKS; /* overkill but safe */ -#endif *new_step_record = NULL; job_ptr = find_job_record (step_specs->job_id); @@ -516,11 +505,9 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, (job_ptr->end_time <= time(NULL))) return ESLURM_ALREADY_DONE; -#ifdef HAVE_ELAN if ((step_specs->task_dist != SLURM_DIST_CYCLIC) && (step_specs->task_dist != SLURM_DIST_BLOCK)) return ESLURM_BAD_DIST; -#endif if (job_ptr->kill_on_step_done) /* Don't start more steps, job already being cancelled */ @@ -542,10 +529,6 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, if ((step_specs->num_tasks < 1) || (step_specs->num_tasks > (node_count*MAX_TASKS_PER_NODE))) return ESLURM_BAD_TASK_COUNT; -#ifdef HAVE_ELAN - if (step_specs->num_tasks > node_set_size) - return ESLURM_BAD_TASK_COUNT; -#endif step_ptr = create_step_record (job_ptr); if (step_ptr == NULL) @@ -561,39 +544,16 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, step_ptr->port = step_specs->port; step_ptr->host = xstrdup(step_specs->host); -#ifdef HAVE_ELAN - if (qsw_alloc_jobinfo (&step_ptr->qsw_job) < 0) - fatal ("step_create: qsw_alloc_jobinfo error"); - first = bit_ffs (step_ptr->step_node_bitmap); - last = bit_fls (step_ptr->step_node_bitmap); - nodeset = bit_alloc (node_set_size); - if (nodeset == NULL) - fatal 
("step_create: bit_alloc error"); - for (i = first; i <= last; i++) { - if (bit_test (step_ptr->step_node_bitmap, i)) { - node_id = qsw_getnodeid_byhost ( - node_record_table_ptr[i].name); - if (node_id >= 0) /* no lookup error */ - bit_set(nodeset, node_id); - else { - error ("qsw_getnodeid_byhost lookup failure on %s", - node_record_table_ptr[i].name); - delete_step_record (job_ptr, - step_ptr->step_id); - bit_free (nodeset); - return ESLURM_INTERCONNECT_FAILURE; - } - } - } - if (qsw_setup_jobinfo (step_ptr->qsw_job, step_specs->num_tasks, - nodeset, step_ptr->cyclic_alloc) < 0) { - error ("step_create: qsw_setup_jobinfo error %m"); + if (switch_alloc_jobinfo (&step_ptr->switch_job) < 0) + fatal ("step_create: switch_alloc_jobinfo error"); + if (switch_build_jobinfo(step_ptr->switch_job, + step_ptr->step_node_list, + step_specs->num_tasks, + step_ptr->cyclic_alloc) < 0) { + error("switch_build_jobinfo: %m"); delete_step_record (job_ptr, step_ptr->step_id); - bit_free (nodeset); return ESLURM_INTERCONNECT_FAILURE; } - bit_free (nodeset); -#endif *new_step_record = step_ptr; return SLURM_SUCCESS; diff --git a/src/slurmd/Makefile.am b/src/slurmd/Makefile.am index 6c0e827f352..a2f0169f163 100644 --- a/src/slurmd/Makefile.am +++ b/src/slurmd/Makefile.am @@ -8,12 +8,6 @@ sbin_PROGRAMS = slurmd INCLUDES = -I$(top_srcdir) $(SSL_CPPFLAGS) -if WITH_ELAN -interconnect_sources = elan_interconnect.c -else -interconnect_sources = no_interconnect.c -endif - slurmd_LDADD = \ $(top_builddir)/src/common/libcommon.la \ $(top_builddir)/src/common/libdaemonize.la \ @@ -21,7 +15,7 @@ slurmd_LDADD = \ $(SSL_LIBS) -common_sources = \ +slurmd_SOURCES = \ slurmd.c slurmd.h \ req.c req.h \ mgr.c mgr.h \ @@ -35,18 +29,10 @@ common_sources = \ shm.c shm.h \ fname.c fname.h \ ulimits.c ulimits.h \ - setenvpf.c setenvpf.h \ - setproctitle.c setproctitle.h \ - interconnect.h - -slurmd_SOURCES = $(common_sources) $(interconnect_sources) + setproctitle.c setproctitle.h slurmd_LDFLAGS = 
-export-dynamic -EXTRA_slurmd_SOURCES = \ - no_interconnect.c \ - elan_interconnect.c - force: $(slurmd_LDADD) : force @cd `dirname $@` && $(MAKE) `basename $@` diff --git a/src/slurmd/elan_interconnect.c b/src/slurmd/elan_interconnect.c deleted file mode 100644 index ecf503ef675..00000000000 --- a/src/slurmd/elan_interconnect.c +++ /dev/null @@ -1,322 +0,0 @@ -/*****************************************************************************\ - * src/slurmd/elan_interconnect.c Elan interconnect implementation - ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Kevin Tew <tew1@llnl.gov> - * and Mark Grondona <mgrondona@llnl.gov> - * UCRL-CODE-2002-040. - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
-\*****************************************************************************/ - -#include <sys/types.h> -#include <sys/wait.h> -#include <stdio.h> -#include <unistd.h> -#include <stdlib.h> -#include <string.h> -#include <stdarg.h> -#include <pthread.h> - - -#include <slurm/slurm_errno.h> - -#include "src/common/xmalloc.h" -#include "src/common/xstring.h" -#include "src/common/bitstring.h" -#include "src/common/log.h" -#include "src/common/list.h" -#include "src/common/hostlist.h" -#include "src/common/qsw.h" -#include "src/common/slurm_protocol_api.h" -#include "src/common/elanhosts.h" - -#include "src/slurmd/interconnect.h" -#include "src/slurmd/setenvpf.h" - -#ifdef HAVE_LIBELAN3 -#include <elan3/elan3.h> -/* - * Static prototypes for network error resolver creation: - */ -static int set_elan_ids(void); -static void *neterr_thr(void *arg); - -static int neterr_retval = 0; -static pthread_t neterr_tid; -static pthread_mutex_t neterr_mutex = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t neterr_cond = PTHREAD_COND_INITIALIZER; - -#endif /* HAVE_LIBELAN3 */ - - -/* Initialize node for use of the Elan interconnect by loading - * elanid/hostname pairs then spawning the Elan network error - * resover thread. - * - * Main thread waits for neterr thread to successfully start before - * continuing. 
- */ -int interconnect_node_init(void) -{ -#if HAVE_LIBELAN3 - int err = 0; - pthread_attr_t attr; - - /* - * We only know how to do this for Elan3 right now - */ - - /* - * Load neterr elanid/hostname values into kernel - */ - if (set_elan_ids() < 0) - return SLURM_FAILURE; - - if ((err = pthread_attr_init(&attr))) - error("pthread_attr_init: %s", slurm_strerror(err)); - - err = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); - if (err) - error("pthread_attr_setdetachstate: %s", slurm_strerror(err)); - - slurm_mutex_lock(&neterr_mutex); - - if ((err = pthread_create(&neterr_tid, &attr, neterr_thr, NULL))) - return SLURM_FAILURE; - - /* - * Wait for successful startup of neterr thread before - * returning control to slurmd. - */ - pthread_cond_wait(&neterr_cond, &neterr_mutex); - pthread_mutex_unlock(&neterr_mutex); - - return neterr_retval; - -#else /* !HAVE_LIBELAN3 */ - - return SLURM_SUCCESS; -#endif /* HAVE_LIBELAN3 */ - -} - -#if HAVE_LIBELAN3 -static void *neterr_thr(void *arg) -{ - debug3("Starting Elan network error resolver thread"); - - if (!elan3_init_neterr_svc(0)) { - error("elan3_init_neterr_svc: %m"); - goto fail; - } - - /* - * Attempt to register the neterr svc thread. If the address - * cannot be bound, then there is already a thread running, and - * we should just exit with success. - */ - if (!elan3_register_neterr_svc()) { - if (errno != EADDRINUSE) { - error("elan3_register_neterr_svc: %m"); - goto fail; - } - info("Warning: Elan error resolver thread already running"); - } - - /* - * Signal main thread that we've successfully initialized - */ - slurm_mutex_lock(&neterr_mutex); - neterr_retval = 0; - pthread_cond_signal(&neterr_cond); - slurm_mutex_unlock(&neterr_mutex); - - /* - * Run the network error resolver thread. This should - * never return. If it does, there's not much we can do - * about it. 
- */ - elan3_run_neterr_svc(); - - return NULL; - - fail: - slurm_mutex_lock(&neterr_mutex); - neterr_retval = SLURM_FAILURE; - pthread_cond_signal(&neterr_cond); - slurm_mutex_unlock(&neterr_mutex); - - return NULL; -} -#endif /* HAVE_LIBELAN3 */ - - -/* - * Called from slurmd just before termination. - * We don't really need to do anything special for Elan, but - * we'll call pthread_cancel() on the neterr resolver thread anyhow. - */ -int -interconnect_node_fini(void) -{ -#if HAVE_LIBELAN3 - int err = pthread_cancel(neterr_tid); - if (err == 0) - return SLURM_SUCCESS; - - error("Unable to cancel neterr thread: %s", slurm_strerror(err)); - return SLURM_FAILURE; -#endif /* HAVE_LIBELAN3 */ - return SLURM_SUCCESS; -} - - -static int -_wait_and_destroy_prg(qsw_jobinfo_t qsw_job) -{ - int i = 0; - int sleeptime = 1; - - debug("going to destroy program description..."); - - while((qsw_prgdestroy(qsw_job) < 0) && (errno == ECHILD_PRGDESTROY)) { - debug("qsw_prgdestroy: %m"); - i++; - if (i == 1) { - debug("sending SIGTERM to remaining tasks"); - qsw_prgsignal(qsw_job, SIGTERM); - } else { - debug("sending SIGKILL to remaining tasks"); - qsw_prgsignal(qsw_job, SIGKILL); - } - - debug("sleeping for %d sec ...", sleeptime); - sleep(sleeptime*=2); - } - - debug("destroyed program description"); - return SLURM_SUCCESS; -} - - -int -interconnect_preinit(slurmd_job_t *job) -{ - return SLURM_SUCCESS; -} - -/* - * prepare node for interconnect use - */ -int -interconnect_init(slurmd_job_t *job) -{ - char buf[4096]; - - debug2("calling interconnect_init from process %lu", - (unsigned long) getpid()); - verbose("ELAN: %s", qsw_capability_string(job->qsw_job, buf, 4096)); - - if (qsw_prog_init(job->qsw_job, job->uid) < 0) { - /* - * Check for EBADF, which probably means the rms - * kernel module is not loaded. 
- */ - if (errno == EBADF) - error("Initializing interconnect: " - "is the rms kernel module loaded?"); - else - error ("elan_interconnect_init: %m"); - - qsw_print_jobinfo(log_fp(), job->qsw_job); - - return SLURM_ERROR; - } - - return SLURM_SUCCESS; -} - -int -interconnect_fini(slurmd_job_t *job) -{ - qsw_prog_fini(job->qsw_job); - return SLURM_SUCCESS; -} - -int -interconnect_postfini(slurmd_job_t *job) -{ - _wait_and_destroy_prg(job->qsw_job); - return SLURM_SUCCESS; -} - -int -interconnect_attach(slurmd_job_t *job, int procid) -{ - int nodeid, nnodes, nprocs; - int rank = job->task[procid]->gid; - - nodeid = job->nodeid; - nnodes = job->nnodes; - nprocs = job->nprocs; - - debug3("nodeid=%d nnodes=%d procid=%d nprocs=%d", - nodeid, nnodes, procid, nprocs); - debug3("setting capability in process %lu", (unsigned long) getpid()); - if (qsw_setcap(job->qsw_job, procid) < 0) { - error("qsw_setcap: %m"); - return SLURM_ERROR; - } - - if (setenvpf(&job->env, "RMS_RANK", "%d", rank ) < 0) - return -1; - if (setenvpf(&job->env, "RMS_NODEID", "%d", job->nodeid) < 0) - return -1; - if (setenvpf(&job->env, "RMS_PROCID", "%d", rank ) < 0) - return -1; - if (setenvpf(&job->env, "RMS_NNODES", "%d", job->nnodes) < 0) - return -1; - if (setenvpf(&job->env, "RMS_NPROCS", "%d", job->nprocs) < 0) - return -1; - - return SLURM_SUCCESS; -} - - -#if HAVE_LIBELAN3 - -static int -set_elan_ids(void) -{ - int i; - - for (i = 0; i <= qsw_maxnodeid(); i++) { - char host[256]; - if (qsw_gethost_bynodeid(host, 256, i) < 0) - continue; - - if (elan3_load_neterr_svc(i, host) < 0) - error("elan3_load_neterr_svc(%d, %s): %m", i, host); - } - - return 0; -} - -#endif diff --git a/src/slurmd/interconnect.h b/src/slurmd/interconnect.h deleted file mode 100644 index c5461df7573..00000000000 --- a/src/slurmd/interconnect.h +++ /dev/null @@ -1,116 +0,0 @@ -/*****************************************************************************\ - * src/slurmd/interconnect.h - general interconnect routines for 
slurmd - * $Id$ - ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Kevin Tew <tew1@llnl.gov> - * modified by Mark Grondona <mgrondona@llnl.gov> - * UCRL-CODE-2002-040. - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. -\*****************************************************************************/ - -#ifndef _INTERCONNECT_H_ -#define _INTERCONNECT_H_ - -#include "src/slurmd/job.h" - -/* - * Setup node for interconnect use. - * - * This function is run from the top level slurmd only once per - * slurmd run. It may be used, for instance, to perform some one-time - * interconnect setup or spawn an error handling thread. - * - */ -int interconnect_node_init(void); - -/* - * Finalize interconnect on node. 
- * - * This function is called once as slurmd exits (slurmd will wait for - * this function to return before continuing the exit process) - */ -int interconnect_node_fini(void); - - -/* - * Notes on job related interconnect functions: - * - * Interconnect functions are run within slurmd in the following way: - * (Diagram courtesy of Jim Garlick [see qsw.c] ) - * - * Process 1 (root) Process 2 (root, user) | Process 3 (user task) - * | - * interconnect_preinit | - * fork ------------------ interconnect_init | - * waitpid setuid, chdir, etc. | - * fork N procs -----------+--- interconnect_attach - * wait all | exec mpi process - * interconnect_fini* | - * interconnect_postfini | - * | - * - * [ *Note: interconnect_fini() is run as the uid of the job owner, not root ] - */ -/* - * Prepare node for job. - * - * pre is run as root in the first slurmd process, the so called job - * manager. This function can be used to perform any initialization - * that needs to be performed in the same process as interconnect_fini() - * - */ -int interconnect_preinit(slurmd_job_t *job); - -/* - * initialize interconnect on node for job. This function is run from the - * 2nd slurmd process (some interconnect implementations may require - * interconnect init functions to be executed from a separate process - * than the process executing initerconnect_fini() [e.g. QsNet]) - * - */ -int interconnect_init(slurmd_job_t *job); - -/* - * This function is run from the same process as interconnect_init() - * after all job tasks have exited. It is *not* run as root, because - * the process in question has already setuid to the job owner. - * - */ -int interconnect_fini(slurmd_job_t *job); - -/* - * Finalize interconnect on node. - * - * This function is run from the initial slurmd process (same process - * as interconnect_preinit()), and is run as root. Any cleanup routines - * that need to be run with root privileges should be run from this - * function. 
- */ -int interconnect_postfini(slurmd_job_t *job); - -/* - * attach process to interconnect - * (Called from within the process, so it is appropriate to set - * interconnect specific environment variables here) - */ -int interconnect_attach(slurmd_job_t *job, int taskid); - -#endif /* _INTERCONNECT_H */ diff --git a/src/slurmd/job.c b/src/slurmd/job.c index 047d2116a47..0f0efaac024 100644 --- a/src/slurmd/job.c +++ b/src/slurmd/job.c @@ -137,9 +137,7 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) memcpy(&io_addr, cli_addr, sizeof(slurm_addr)); slurm_set_addr(&io_addr, msg->io_port, NULL); -#ifdef HAVE_ELAN - job->qsw_job = msg->qsw_job; -#endif + job->switch_job = msg->switch_job; job->objs = list_create((ListDelF) io_obj_destroy); job->eio = eio_handle_create(); diff --git a/src/slurmd/job.h b/src/slurmd/job.h index 280ed30b4f8..baff2684613 100644 --- a/src/slurmd/job.h +++ b/src/slurmd/job.h @@ -38,7 +38,7 @@ #include "src/common/slurm_protocol_api.h" #include "src/common/list.h" #include "src/common/eio.h" - +#include "src/common/switch.h" #ifndef MAXHOSTNAMELEN #define MAXHOSTNAMELEN 64 @@ -110,9 +110,7 @@ typedef struct slurmd_job { char **env; /* job environment */ char **argv; /* job argument vector */ char *cwd; /* path to current working directory */ -#ifdef HAVE_ELAN - qsw_jobinfo_t qsw_job; /* Elan-specific job information */ -#endif + switch_jobinfo_t switch_job; /* switch-specific job information */ uid_t uid; /* user id for job */ gid_t gid; /* group ID for job */ diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index 47db96942c3..dc6ffa919b4 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -58,6 +58,8 @@ #include "src/common/log.h" #include "src/common/fd.h" #include "src/common/safeopen.h" +#include "src/common/setenvpf.h" +#include "src/common/switch.h" #include "src/common/xsignal.h" #include "src/common/xstring.h" #include "src/common/xmalloc.h" @@ -66,12 +68,10 @@ #include "src/slurmd/mgr.h" #include 
"src/slurmd/slurmd.h" -#include "src/slurmd/setenvpf.h" #include "src/slurmd/setproctitle.h" #include "src/slurmd/smgr.h" #include "src/slurmd/io.h" #include "src/slurmd/shm.h" -#include "src/slurmd/interconnect.h" /* @@ -428,7 +428,8 @@ _job_mgr(slurmd_job_t *job) goto fail0; } - if (!job->batch && (interconnect_preinit(job) < 0)) { + if (!job->batch && + (interconnect_preinit(job->switch_job) < 0)) { rc = ESLURM_INTERCONNECT_FAILURE; goto fail1; } @@ -482,7 +483,9 @@ _job_mgr(slurmd_job_t *job) * is moved behind wait_for_io(), we may block waiting for IO * on a hung process. */ - if (!job->batch && (interconnect_postfini(job) < 0)) + if (!job->batch && + (interconnect_postfini(job->switch_job, job->smgr_pid, + job->jobid, job->stepid) < 0)) error("interconnect_postfini: %m"); /* diff --git a/src/slurmd/no_interconnect.c b/src/slurmd/no_interconnect.c deleted file mode 100644 index f55f9116be8..00000000000 --- a/src/slurmd/no_interconnect.c +++ /dev/null @@ -1,80 +0,0 @@ -/*****************************************************************************\ - * no_interconnect.c - Manage user task communications without an high-speed - * interconnect - ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Kevin Tew <tew1@llnl.gov> et. al. - * UCRL-CODE-2002-040. - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with ConMan; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. -\*****************************************************************************/ - -#include <sys/types.h> -#include <signal.h> - -#include <src/slurmd/interconnect.h> -#include <src/slurmd/setenvpf.h> - -int interconnect_node_init (void) -{ - return SLURM_SUCCESS; -} - -int interconnect_node_fini (void) -{ - return SLURM_SUCCESS; -} - -int interconnect_preinit (slurmd_job_t *job) -{ - return SLURM_SUCCESS; -} - -int interconnect_init (slurmd_job_t *job) -{ - return SLURM_SUCCESS; -} - -int interconnect_attach (slurmd_job_t *job, int taskid) -{ - return SLURM_SUCCESS; -} - -int interconnect_fini (slurmd_job_t *job) -{ - return SLURM_SUCCESS; -} - -int interconnect_postfini (slurmd_job_t *job) -{ - /* - * Kill all processes in the job's session - */ - if (job->smgr_pid) { - debug2("Sending SIGKILL to pgid %lu", - (unsigned long) job->smgr_pid); - kill(-job->smgr_pid, SIGKILL); - } else - debug("Job %u.%u: Bad pid valud %lu", job->jobid, - job->stepid, (unsigned long) job->smgr_pid); - - return SLURM_SUCCESS; -} - - diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c index 4b91c61e468..450df193d69 100644 --- a/src/slurmd/slurmd.c +++ b/src/slurmd/slurmd.c @@ -60,7 +60,6 @@ #include "src/slurmd/req.h" #include "src/slurmd/shm.h" #include "src/slurmd/setproctitle.h" -#include "src/slurmd/interconnect.h" #include "src/slurmd/get_mach_stat.h" #define GETOPT_ARGS "L:f:Dvhc" diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c index 59562ca7fdb..6f0884d0c5a 100644 --- a/src/slurmd/smgr.c +++ b/src/slurmd/smgr.c @@ -51,12 
+51,12 @@ #include "src/common/fd.h" #include "src/common/log.h" +#include "src/common/setenvpf.h" +#include "src/common/switch.h" #include "src/common/xsignal.h" #include "src/slurmd/smgr.h" #include "src/slurmd/ulimits.h" -#include "src/slurmd/interconnect.h" -#include "src/slurmd/setenvpf.h" #include "src/slurmd/io.h" /* @@ -143,7 +143,8 @@ _session_mgr(slurmd_job_t *job) /* * Call interconnect_init() before becoming user */ - if (!job->batch && (interconnect_init(job) < 0)) { + if (!job->batch && + (interconnect_init(job->switch_job, job->uid) < 0)) { /* error("interconnect_init: %m"); already logged */ exit(1); } @@ -187,7 +188,8 @@ _session_mgr(slurmd_job_t *job) _wait_for_all_tasks(job); - if (!job->batch && (interconnect_fini(job) < 0)) { + if (!job->batch && + (interconnect_fini(job->switch_job) < 0)) { error("interconnect_fini: %m"); exit(1); } @@ -297,7 +299,9 @@ _exec_task(slurmd_job_t *job, int i) } if (!job->batch) { - if (interconnect_attach(job, i) < 0) { + if (interconnect_attach(job->switch_job, &job->env, + job->nodeid, i, job->nnodes, + job->nprocs, job->task[i]->gid) < 0) { error("Unable to attach to interconnect: %m"); exit(1); } diff --git a/src/slurmd/ulimits.c b/src/slurmd/ulimits.c index b8e6a0622f7..3feb5f196b2 100644 --- a/src/slurmd/ulimits.c +++ b/src/slurmd/ulimits.c @@ -35,10 +35,10 @@ #include <string.h> #include "src/common/log.h" +#include "src/common/setenvpf.h" /* For unsetenvp() */ #include "src/common/xmalloc.h" #include "src/slurmd/job.h" -#include "src/slurmd/setenvpf.h" /* For unsetenvp() */ struct userlim { char *var; diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 2f382fc47b5..94c6273cf3f 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -328,9 +328,7 @@ create_job_step(job_t *job) job->stepid = resp->job_step_id; job->cred = resp->cred; -#ifdef HAVE_ELAN - job->qsw_job = resp->qsw_job; -#endif + job->switch_job = resp->switch_job; /* * Recreate filenames which may depend upon step id */ diff 
--git a/src/srun/job.h b/src/srun/job.h index 177f6c2b6b9..b3972c40d5e 100644 --- a/src/srun/job.h +++ b/src/srun/job.h @@ -128,9 +128,7 @@ typedef struct srun_job { task_state_t *task_state; /* ntask task states */ pthread_mutex_t task_mutex; -#ifdef HAVE_ELAN - qsw_jobinfo_t qsw_job; -#endif + switch_jobinfo_t switch_job; io_filename_t *ifname; io_filename_t *ofname; io_filename_t *efname; diff --git a/src/srun/launch.c b/src/srun/launch.c index 449f8ebcb74..2975bc0d6d5 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -237,10 +237,7 @@ launch(void *arg) m->msg_type = REQUEST_LAUNCH_TASKS; m->data = &msg_array_ptr[i]; memcpy(&m->address, &job->slurmd_addr[i], sizeof(slurm_addr)); - -#ifdef HAVE_ELAN - r->qsw_job = job->qsw_job; -#endif + r->switch_job = job->switch_job; #ifdef HAVE_TOTALVIEW if (opt.totalview) diff --git a/src/srun/srun.c b/src/srun/srun.c index 958341c2d1a..ea0db0fc960 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -51,6 +51,7 @@ #include "src/common/fd.h" #include "src/common/log.h" #include "src/common/slurm_protocol_api.h" +#include "src/common/switch.h" #include "src/common/xmalloc.h" #include "src/common/xsignal.h" #include "src/common/xstring.h" @@ -92,12 +93,7 @@ static int _run_batch_job (void); static void _run_job_script(job_t *job); static int _set_batch_script_env(job_t *job); static int _set_rlimit_env(void); - -#ifdef HAVE_ELAN -# include "src/common/qsw.h" - static void _qsw_standalone(job_t *job); -#endif - +static void _switch_standalone(job_t *job); #if HAVE_TOTALVIEW int srun(int ac, char **av) @@ -143,9 +139,7 @@ int main(int ac, char **av) info("do not allocate resources"); sig_setup_sigmask(); job = job_create_noalloc(); -#ifdef HAVE_ELAN - _qsw_standalone(job); -#endif + _switch_standalone(job); } else if ( (resp = existing_allocation()) ) { if (opt.allocate) { @@ -257,27 +251,17 @@ int main(int ac, char **av) } -#ifdef HAVE_ELAN static void -_qsw_standalone(job_t *job) +_switch_standalone(job_t *job) { - 
int i; - bitstr_t bit_decl(nodeset, QSW_MAX_TASKS); - bool cyclic = (opt.distribution == SRUN_DIST_CYCLIC); - - for (i = 0; i < job->nhosts; i++) { - int nodeid; - if ((nodeid = qsw_getnodeid_byhost(job->host[i])) < 0) - fatal("qsw_getnodeid_byhost: %m"); - bit_set(nodeset, nodeid); - } + int cyclic = (opt.distribution == SRUN_DIST_CYCLIC); - if (qsw_alloc_jobinfo(&job->qsw_job) < 0) - fatal("qsw_alloc_jobinfo: %m"); - if (qsw_setup_jobinfo(job->qsw_job, opt.nprocs, nodeset, cyclic) < 0) - fatal("qsw_setup_jobinfo: %m"); + if (switch_alloc_jobinfo(&job->switch_job) < 0) + fatal("switch_alloc_jobinfo: %m"); + if (switch_build_jobinfo(job->switch_job, job->nodelist, opt.nprocs, + cyclic) < 0) + fatal("switch_build_jobinfo: %m"); } -#endif /* HAVE_ELAN */ static void diff --git a/testsuite/slurm_unit/common/Makefile.am b/testsuite/slurm_unit/common/Makefile.am index 1f212adac6e..33ae8950a66 100644 --- a/testsuite/slurm_unit/common/Makefile.am +++ b/testsuite/slurm_unit/common/Makefile.am @@ -1,6 +1,6 @@ AUTOMAKE_OPTIONS = foreign -if WITH_ELAN +if HAVE_ELAN elan_testprogs = runqsw else elan_testprogs = -- GitLab