diff --git a/NEWS b/NEWS index d8040998ae1bd12df2514ee8c1a2e4c6b118b3d4..b557e0438c493867bb77502083b2e15206d0f4cf 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,12 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 1.1.0-pre1 +============================= + -- New --enable-multiple-slurmd configure parameter to allow running + more than one copy of slurmd on a node at the same time. Only + really useful for developers. + * Changes in SLURM 1.0.2 ======================== -- Correctly report DRAINED node state as type OTHER for "sinfo --summarize". diff --git a/configure.ac b/configure.ac index 1b613fd196c30d827352f8da516d1e2a0f22b94c..3c93e15da540018e02848bdb1ef076d546d1f6d0 100644 --- a/configure.ac +++ b/configure.ac @@ -159,6 +159,27 @@ dnl Check for compilation of SLURM auth modules: dnl X_AC_MUNGE +dnl +dnl Check if multiple-slurmd support is requested and define MULTIPLE_SLURMD +dnl if it is. +dnl +AC_MSG_CHECKING(whether to enable multiple-slurmd support) +AC_ARG_ENABLE([multiple-slurmd], + AC_HELP_STRING([--enable-multiple-slurmd], [enable multiple-slurmd support]), + [ case "$enableval" in + yes) multiple_slurmd=yes ;; + no) multiple_slurmd=no ;; + *) AC_MSG_ERROR([bad value "$enableval" for --enable-multiple-slurmd]);; + esac ] +) +if test "x$multiple_slurmd" = "xyes"; then + AC_DEFINE([MULTIPLE_SLURMD], [1], [Enable multiple slurmd on one node]) + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi + + AUTHD_LIBS="-lauth -le" savedLIBS="$LIBS" savedCFLAGS="$CFLAGS" diff --git a/src/api/config_info.c b/src/api/config_info.c index 4b71ff462ed573beee2beb494c8e8fa04974c2be..9cd4434ec09d9e95e1efd63425e69a69c84722a9 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -152,8 +152,10 @@ void slurm_print_ctl_conf ( FILE* out, slurm_ctl_conf_ptr->slurmd_logfile); fprintf(out, "SlurmdPidFile = %s\n", slurm_ctl_conf_ptr->slurmd_pidfile); +#ifndef MULTIPLE_SLURMD fprintf(out, "SlurmdPort = %u\n", slurm_ctl_conf_ptr->slurmd_port); +#endif fprintf(out, "SlurmdSpoolDir = %s\n", slurm_ctl_conf_ptr->slurmd_spooldir); fprintf(out, "SlurmdTimeout = %u\n", diff --git a/src/common/read_config.c b/src/common/read_config.c index d1288feee3095bc215618154a019fbf4487b2b26..6dc003b71ba75f03b50572583a9ef970f29f7a10 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -267,6 +267,7 @@ extern char *get_conf_node_name(char *node_hostname) /* getnodename - equivalent to gethostname, but return only the first * component of the fully qualified name * (e.g. "linux123.foo.bar" becomes "linux123") + * OUT name */ int getnodename (char *name, size_t len) @@ -397,7 +398,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) ctl_conf_ptr->slurmd_debug = (uint16_t) NO_VAL; xfree (ctl_conf_ptr->slurmd_logfile); xfree (ctl_conf_ptr->slurmd_pidfile); - ctl_conf_ptr->slurmd_port = (uint32_t) NO_VAL; + ctl_conf_ptr->slurmd_port = (uint32_t) NO_VAL; xfree (ctl_conf_ptr->slurmd_spooldir); ctl_conf_ptr->slurmd_timeout = (uint16_t) NO_VAL; xfree (ctl_conf_ptr->state_save_location); @@ -933,6 +934,7 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) ctl_conf_ptr->slurmd_logfile = slurmd_logfile; } +#ifndef MULTIPLE_SLURMD if ( slurmd_port != -1) { if ( ctl_conf_ptr->slurmd_port != (uint32_t) NO_VAL) error (MULTIPLE_VALUE_MSG, "SlurmdPort"); @@ -941,6 +943,7 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) else ctl_conf_ptr->slurmd_port = slurmd_port; } +#endif if ( slurmd_spooldir ) { if ( ctl_conf_ptr->slurmd_spooldir ) { @@ -1058,12 +1061,14 @@ _parse_node_spec (char *in_line, bool slurmd_hosts) char *state = NULL, *reason=NULL; char *node_hostname = NULL; int cpus_val, real_memory_val, tmp_disk_val, weight_val; + int port; error_code = slurm_parser (in_line, "Feature=", 's', &feature, "NodeAddr=", 's', &node_addr, "NodeName=", 's', &node_name, "NodeHostname=", 's', &node_hostname, + "Port=", 'd', &port, "Procs=", 'd', &cpus_val, "RealMemory=", 'd', &real_memory_val, "Reason=", 's', &reason, @@ -1076,7 +1081,7 @@ _parse_node_spec (char *in_line, bool slurmd_hosts) return error_code; if (node_name - && (node_hostname || slurmd_hosts)) { + && (node_hostname || slurmd_hosts)) { all_slurmd_hosts = true; _register_conf_node_aliases(node_name, node_hostname); } @@ -1433,8 +1438,10 @@ validate_config (slurm_ctl_conf_t *ctl_conf_ptr) if (ctl_conf_ptr->slurmd_pidfile == NULL) ctl_conf_ptr->slurmd_pidfile = xstrdup(DEFAULT_SLURMD_PIDFILE); +#ifndef MULTIPLE_SLURMD if (ctl_conf_ptr->slurmd_port == (uint32_t) NO_VAL) ctl_conf_ptr->slurmd_port = SLURMD_PORT; +#endif if (ctl_conf_ptr->slurmd_spooldir == NULL) ctl_conf_ptr->slurmd_spooldir = xstrdup(DEFAULT_SPOOLDIR); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 68c143324084894ac9194a613e90ed9f2e6769e5..fae0b90c32ec12f9b2619f2980be6a9d560ac9d7 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -1920,7 +1920,9 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) pack16(build_ptr->slurmd_debug, buffer); packstr(build_ptr->slurmd_logfile, buffer); packstr(build_ptr->slurmd_pidfile, buffer); +#ifndef MULTIPLE_SLURMD pack32(build_ptr->slurmd_port, buffer); +#endif packstr(build_ptr->slurmd_spooldir, buffer); debug2("Packing string %s", build_ptr->slurmd_spooldir); pack16(build_ptr->slurmd_timeout, buffer); @@ -2006,7 +2008,9 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** buffer); safe_unpackstr_xmalloc(&build_ptr->slurmd_pidfile, &uint16_tmp, buffer); +#ifndef MULTIPLE_SLURMD safe_unpack32(&build_ptr->slurmd_port, buffer); +#endif safe_unpackstr_xmalloc(&build_ptr->slurmd_spooldir, &uint16_tmp, buffer); safe_unpack16(&build_ptr->slurmd_timeout, buffer); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index e8336ba221f4f9b4eb14e8f8f9ce24220bd0f216..38cc10c1047a90bfd0d2196876c12e26d9457083 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -197,6 +197,7 @@ create_node_record (struct config_record *config_ptr, char *node_name) strcpy (node_ptr->name, node_name); node_ptr->node_state = default_node_record.node_state; node_ptr->last_response = default_node_record.last_response; + node_ptr->port = default_node_record.port; node_ptr->config_ptr = config_ptr; node_ptr->part_cnt = 0; node_ptr->part_pptr = NULL; @@ -803,8 +804,10 @@ void set_slurmd_addr (void) for (i = 0; i < node_record_count; i++, node_ptr++) { if (node_ptr->name[0] == '\0') continue; + if (node_ptr->port == 0) + node_ptr->port = slurmctld_conf.slurmd_port; slurm_set_addr (&node_ptr->slurm_addr, - slurmctld_conf.slurmd_port, + node_ptr->port, node_ptr->comm_name); if (node_ptr->slurm_addr.sin_port) continue; diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index f3001888415116f33addacf7ad4700e9cf43f2f2..0d5ed7b8849043edb293d842d0ff7911f8c8f199 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -337,7 +337,9 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) slurmd_logfile); conf_ptr->slurmd_pidfile = xstrdup(slurmctld_conf. slurmd_pidfile); +#ifndef MULTIPLE_SLURMD conf_ptr->slurmd_port = slurmctld_conf.slurmd_port; +#endif conf_ptr->slurmd_spooldir = xstrdup(slurmctld_conf. slurmd_spooldir); conf_ptr->slurmd_timeout = slurmctld_conf.slurmd_timeout; diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 14eddbc01775d7cc87c1c4a11d0fe07c6f229587..bb6a7a6fad55aa452ea902503d1c90902e9a2a33 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -289,10 +289,12 @@ static int _parse_node_spec(char *in_line) #ifndef HAVE_FRONT_END /* Fake node addresses for front-end */ char *this_node_addr; #endif + int port; node_addr = node_name = state = feature = (char *) NULL; cpus_val = real_memory_val = state_val = NO_VAL; tmp_disk_val = weight_val = NO_VAL; + port = NO_VAL; if ((error_code = load_string(&node_name, "NodeName=", in_line))) return error_code; if (node_name == NULL) @@ -307,6 +309,9 @@ static int _parse_node_spec(char *in_line) "Feature=", 's', &feature, "NodeAddr=", 's', &node_addr, "NodeHostname=", 's', &node_hostname, +#ifdef MULTIPLE_SLURMD + "Port=", 'd', &port, +#endif "Procs=", 'd', &cpus_val, "RealMemory=", 'd', &real_memory_val, "Reason=", 's', &reason, @@ -396,6 +401,10 @@ static int _parse_node_spec(char *in_line) default_config_record.feature = feature; feature = NULL; } +#ifdef MULTIPLE_SLURMD + if (port != NO_VAL) + default_node_record.port = port; +#endif free(this_node_name); break; } @@ -457,6 +466,10 @@ static int _parse_node_spec(char *in_line) } else strncpy(node_ptr->comm_name, node_ptr->name, MAX_NAME_LEN); +#endif +#ifdef MULTIPLE_SLURMD + if (port != NO_VAL) + node_ptr->port = port; #endif node_ptr->reason = xstrdup(reason); } else { diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 98a009ffc8f47aeda2099bdbdd4199f0f0f462ab..a860c5dbf3d9072b883f0bb2e57d4c30e66aa609 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -171,6 +171,7 @@ struct node_record { struct part_record **part_pptr; /* array of pointers to partitions * associated with this node*/ char comm_name[MAX_NAME_LEN]; /* communications path name to node */ + uint16_t port; /* TCP port number of the slurmd */ slurm_addr slurm_addr; /* network address */ uint16_t comp_job_cnt; /* count of jobs completing on node */ uint16_t run_job_cnt; /* count of jobs running on node */ diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index b977908d784749e797ff50abe7d3bc999c30e8a1..587d1046cd41ce38fcc83b6602f741c3ae28cff1 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -67,7 +67,11 @@ #include "src/slurmd/common/proctrack.h" #include "src/slurmd/common/task_plugin.h" +#ifdef MULTIPLE_SLURMD +#define GETOPT_ARGS "L:Dvhcf:MN:P:" +#else #define GETOPT_ARGS "L:Dvhcf:M" +#endif #ifndef MAXHOSTNAMELEN # define MAXHOSTNAMELEN 64 @@ -105,7 +109,6 @@ static void _create_msg_socket(); static void _msg_engine(); static int _slurmd_init(); static int _slurmd_fini(); -static void _create_conf(); static void _init_conf(); static void _print_conf(); static void _read_config(); @@ -141,7 +144,7 @@ main (int argc, char *argv[]) * Create and set default values for the slurmd global * config variable "conf" */ - _create_conf(); + conf = xmalloc(sizeof(slurmd_conf_t)); _init_conf(); conf->argv = &argv; conf->argc = &argc; @@ -473,7 +476,6 @@ _free_and_set(char **confvar, char *newval) /* * Read the slurm configuration file (slurm.conf) and substitute some * values into the slurmd configuration in preference of the defaults. - * */ static void _read_config() @@ -489,7 +491,9 @@ _read_config() if (conf->conffile == NULL) conf->conffile = xstrdup(conf->cf.slurm_conf); +#ifndef MULTIPLE_SLURMD conf->port = conf->cf.slurmd_port; +#endif conf->slurm_user_id = conf->cf.slurm_user_id; path_pubkey = xstrdup(conf->cf.job_credential_public_certificate); @@ -497,12 +501,23 @@ _read_config() if (!conf->logfile) conf->logfile = xstrdup(conf->cf.slurmd_logfile); - _free_and_set(&conf->node_name, get_conf_node_name(conf->hostname)); + /* node_name may already be set from a command line parameter */ + if (conf->node_name == NULL) + _free_and_set(&conf->node_name, + get_conf_node_name(conf->hostname)); _free_and_set(&conf->epilog, xstrdup(conf->cf.epilog)); _free_and_set(&conf->prolog, xstrdup(conf->cf.prolog)); _free_and_set(&conf->tmpfs, xstrdup(conf->cf.tmp_fs)); _free_and_set(&conf->spooldir, xstrdup(conf->cf.slurmd_spooldir)); +#ifdef MULTIPLE_SLURMD + /* append the NodeName to the spooldir to make it unique */ + xstrfmtcat(conf->spooldir, ".%s", conf->node_name); +#endif _free_and_set(&conf->pidfile, xstrdup(conf->cf.slurmd_pidfile)); +#ifdef MULTIPLE_SLURMD + /* append the NodeName to the pidfile name to make it unique */ + xstrfmtcat(conf->pidfile, ".%s", conf->node_name); +#endif _free_and_set(&conf->task_prolog, xstrdup(conf->cf.task_prolog)); _free_and_set(&conf->task_epilog, xstrdup(conf->cf.task_epilog)); _free_and_set(&conf->pubkey, path_pubkey); @@ -562,12 +577,6 @@ _print_conf() debug3("TaskEpilog = `%s'", conf->task_epilog); } -static void -_create_conf() -{ - conf = xmalloc(sizeof(slurmd_conf_t)); -} - static void _init_conf() { @@ -632,6 +641,14 @@ _process_cmdline(int ac, char **av) case 'M': conf->mlock_pages = 1; break; +#ifdef MULTIPLE_SLURMD + case 'N': + conf->node_name = xstrdup(optarg); + break; + case 'P': + conf->port = (uint16_t)atoi(optarg); + break; +#endif default: _usage(c); exit(1); @@ -721,7 +738,7 @@ _slurmd_init() /* * Need to kill any running slurmd's here */ - _kill_old_slurmd(); + _kill_old_slurmd(); stepd_cleanup_sockets(conf->spooldir, conf->node_name); } diff --git a/src/srun/launch.c b/src/srun/launch.c index f045455e4fdbfc2414dce02faf7d4f55bfa1d496..d516cf176eab7023e4c630136a623f4b4ea9cb88 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -115,7 +115,6 @@ launch(void *arg) debug("going to launch %d tasks on %d hosts", opt.nprocs, job->step_layout->num_hosts); - debug("sending to slurmd port %d", slurm_get_slurmd_port()); msg_array_ptr = xmalloc(sizeof(launch_tasks_request_msg_t) * job->step_layout->num_hosts);