diff --git a/NEWS b/NEWS
index 9ea497271ea8f88a745b635ee7721f37a5b7dfcd..b338ab6925d701722b057c09a949f25263f5afbf 100644
--- a/NEWS
+++ b/NEWS
@@ -76,6 +76,7 @@ documents those changes that are of interest to users and administrators.
  -- slurm.spec - add new --with slurmsmwd option.
  -- pmi2: add mutex locking to all API calls to ensure thread-safety.
  -- Fix QOS usage factor to apply to TRES time limits and usage.
+ -- Fix multi-cluster srun's with Select/Cray and other_cons_res.
 
 * Changes in Slurm 19.05.0pre3
 ==============================
diff --git a/src/common/env.c b/src/common/env.c
index 031e0a35f4bc238563f041e38431240995f65501..498caa3e67b0d00dab017483af252de20e85d8e2 100644
--- a/src/common/env.c
+++ b/src/common/env.c
@@ -790,9 +790,10 @@ int setup_env(env_t *env, bool preserve_env)
 		addr = slurmctld_conf.slurmctld_addr;
 	else
 		addr = slurmctld_conf.control_addr[0];
-	setenvf(&env->env, "SLURM_WORKING_CLUSTER", "%s:%s:%d:%d",
+	setenvf(&env->env, "SLURM_WORKING_CLUSTER", "%s:%s:%d:%d:%d",
 		slurmctld_conf.cluster_name, addr,
-		slurmctld_conf.slurmctld_port, SLURM_PROTOCOL_VERSION);
+		slurmctld_conf.slurmctld_port, SLURM_PROTOCOL_VERSION,
+		select_get_plugin_id());
 
 	return rc;
 }
diff --git a/src/common/working_cluster.c b/src/common/working_cluster.c
index b31a9fd60d4cc8bcc8e0a6ae98e8b39801c33c46..94ed5821b25c5355e1d0a810dd5aef596bc25e0b 100644
--- a/src/common/working_cluster.c
+++ b/src/common/working_cluster.c
@@ -199,6 +199,9 @@ slurm_setup_remote_working_cluster(resource_allocation_response_msg_t *msg)
 	working_cluster_rec = (slurmdb_cluster_rec_t *)msg->working_cluster_rec;
 	msg->working_cluster_rec = NULL;
 
+	working_cluster_rec->plugin_id_select =
+		select_get_plugin_id_pos(working_cluster_rec->plugin_id_select);
+
 	slurm_set_addr(&working_cluster_rec->control_addr,
 		       working_cluster_rec->control_port,
 		       working_cluster_rec->control_host);
diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c
index 7dd90ddb4bea32009ff8879eb31a562ddb29e08a..0962544ce2de0bd3de266099b938a1b94bd675fd 100644
--- a/src/salloc/salloc.c
+++ b/src/salloc/salloc.c
@@ -60,6 +60,7 @@
 #include "src/common/cli_filter.h"
 #include "src/common/cpu_frequency.h"
 #include "src/common/env.h"
+#include "src/common/node_select.h"
 #include "src/common/plugstack.h"
 #include "src/common/proc_args.h"
 #include "src/common/read_config.h"
@@ -687,11 +688,12 @@ static int _proc_alloc(resource_allocation_response_msg_t *alloc)
 		slurm_setup_remote_working_cluster(alloc);
 
 		/* set env for srun's to find the right cluster */
-		setenvf(NULL, "SLURM_WORKING_CLUSTER", "%s:%s:%d:%d",
+		setenvf(NULL, "SLURM_WORKING_CLUSTER", "%s:%s:%d:%d:%d",
 			working_cluster_rec->name,
 			working_cluster_rec->control_host,
 			working_cluster_rec->control_port,
-			working_cluster_rec->rpc_version);
+			working_cluster_rec->rpc_version,
+			select_get_plugin_id());
 	}
 
 	if (!_wait_nodes_ready(alloc)) {
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 9cc95a1dca994d860214065014e62d53638c961c..4e520dd78e57424e87e2c6c822b5e458f7db846e 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -161,6 +161,7 @@ static void _set_response_cluster_rec(void)
 	}
 	response_cluster_rec->control_port = slurmctld_conf.slurmctld_port;
 	response_cluster_rec->rpc_version = SLURM_PROTOCOL_VERSION;
+	response_cluster_rec->plugin_id_select = select_get_plugin_id();
 }
 
 /*
diff --git a/src/srun/srun.c b/src/srun/srun.c
index c187d27771f8fb31aa51c1e657cc293ff5611140..351b46ccdc2e9254ed7c42bc225af57c3b0d641a 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -844,37 +844,43 @@ static void _pty_restore(void)
 
 static void _setup_env_working_cluster(void)
 {
-	char *working_env = NULL;
+	char *working_env, *addr_ptr, *port_ptr, *rpc_ptr, *select_ptr;
 
-	if ((working_env = xstrdup(getenv("SLURM_WORKING_CLUSTER")))) {
-		char *addr_ptr, *port_ptr, *rpc_ptr;
+	if ((working_env = xstrdup(getenv("SLURM_WORKING_CLUSTER"))) == NULL)
+		return;
 
-		if (!(addr_ptr = strchr(working_env, ':')) ||
-		    !(port_ptr = strchr(addr_ptr + 1, ':')) ||
-		    !(rpc_ptr = strchr(port_ptr + 1, ':'))) {
-			error("malformed cluster addr and port in SLURM_WORKING_CLUSTER env var: '%s'",
-			      working_env);
-			exit(1);
-		}
+	/* Format is cluster_name:address:port:rpc[:plugin_id_select] */
+	if (!(addr_ptr = strchr(working_env, ':')) ||
+	    !(port_ptr = strchr(addr_ptr + 1, ':')) ||
+	    !(rpc_ptr = strchr(port_ptr + 1, ':'))) {
+		error("malformed cluster addr and port in SLURM_WORKING_CLUSTER env var: '%s'",
+		      working_env);
+		exit(1);
+	}
 
-		*addr_ptr++ = '\0';
-		*port_ptr++ = '\0';
-		*rpc_ptr++ = '\0';
-
-		if (xstrcmp(slurmctld_conf.cluster_name, working_env)) {
-			working_cluster_rec =
-				xmalloc(sizeof(slurmdb_cluster_rec_t));
-			slurmdb_init_cluster_rec(working_cluster_rec, false);
-
-			working_cluster_rec->control_host = xstrdup(addr_ptr);;
-			working_cluster_rec->control_port = strtol(port_ptr,
-								   NULL, 10);
-			working_cluster_rec->rpc_version = strtol(rpc_ptr,
-								  NULL, 10);
-			slurm_set_addr(&working_cluster_rec->control_addr,
-				       working_cluster_rec->control_port,
-				       working_cluster_rec->control_host);
-		}
-		xfree(working_env);
+	*addr_ptr++ = '\0';
+	*port_ptr++ = '\0';
+	*rpc_ptr++ = '\0';
+
+	if ((select_ptr = strchr(rpc_ptr, ':')))
+		*select_ptr++ = '\0';
+
+	if (xstrcmp(slurmctld_conf.cluster_name, working_env)) {
+		working_cluster_rec = xmalloc(sizeof(slurmdb_cluster_rec_t));
+		slurmdb_init_cluster_rec(working_cluster_rec, false);
+
+		working_cluster_rec->name = xstrdup(working_env);
+		working_cluster_rec->control_host = xstrdup(addr_ptr);
+		working_cluster_rec->control_port = strtol(port_ptr, NULL, 10);
+		working_cluster_rec->rpc_version = strtol(rpc_ptr, NULL, 10);
+		slurm_set_addr(&working_cluster_rec->control_addr,
+			       working_cluster_rec->control_port,
+			       working_cluster_rec->control_host);
+
+		if (select_ptr)
+			working_cluster_rec->plugin_id_select =
+				select_get_plugin_id_pos(strtol(select_ptr,
+								NULL, 10));
 	}
+	xfree(working_env);
 }