From 41efcfd38de784db04f406b7b803210dca4a16da Mon Sep 17 00:00:00 2001 From: "Adam T. Moody" <moody20@llnl.gov> Date: Wed, 22 Apr 2015 16:12:25 -0700 Subject: [PATCH] Develop a new MPI key,value exchange algorithm PMIX_Ring() --- contribs/pmi2/Makefile.am | 1 + contribs/pmi2/Makefile.in | 1 + contribs/pmi2/pmi2_api.c | 51 ++++++++++++++++++ contribs/pmi2/slurm/pmi2.h | 32 +++++++++++ contribs/pmi2/testpmixring.c | 63 ++++++++++++++++++++++ src/plugins/mpi/pmi2/Makefile.am | 3 +- src/plugins/mpi/pmi2/Makefile.in | 6 ++- src/plugins/mpi/pmi2/pmi.h | 6 +++ src/plugins/mpi/pmi2/pmi2.c | 33 +++++++++++- src/plugins/mpi/pmi2/setup.c | 7 +++ src/plugins/mpi/pmi2/tree.c | 93 ++++++++++++++++++++++++++++++++ src/plugins/mpi/pmi2/tree.h | 2 + 12 files changed, 294 insertions(+), 4 deletions(-) create mode 100644 contribs/pmi2/testpmixring.c diff --git a/contribs/pmi2/Makefile.am b/contribs/pmi2/Makefile.am index d614be091ac..90f305b5a17 100644 --- a/contribs/pmi2/Makefile.am +++ b/contribs/pmi2/Makefile.am @@ -27,6 +27,7 @@ libpmi2_la_LDFLAGS = $(LIB_LDFLAGS) -version-info $(libpmi2_current):$(libpmi2_r $(PMI2_VERSION_SCRIPT) : (echo "{ global:"; \ echo " PMI2_*;"; \ + echo " PMIX_*;"; \ echo " local: *;"; \ echo "};") > $(PMI2_VERSION_SCRIPT) diff --git a/contribs/pmi2/Makefile.in b/contribs/pmi2/Makefile.in index 8bd9a576aad..6ba87ba93fa 100644 --- a/contribs/pmi2/Makefile.in +++ b/contribs/pmi2/Makefile.in @@ -850,6 +850,7 @@ uninstall-am: uninstall-libLTLIBRARIES uninstall-pkgincludeHEADERS $(PMI2_VERSION_SCRIPT) : (echo "{ global:"; \ echo " PMI2_*;"; \ + echo " PMIX_*;"; \ echo " local: *;"; \ echo "};") > $(PMI2_VERSION_SCRIPT) diff --git a/contribs/pmi2/pmi2_api.c b/contribs/pmi2/pmi2_api.c index fc36ed2cdca..2789fbcee48 100644 --- a/contribs/pmi2/pmi2_api.c +++ b/contribs/pmi2/pmi2_api.c @@ -644,6 +644,57 @@ fn_fail: goto fn_exit; } +int PMIX_Ring(const char value[], int *rank, int *ranks, char left[], char right[], int maxvalue) +{ + int pmi2_errno = PMI2_SUCCESS; + PMI2_Command cmd = {0}; + int rc; + const char *errmsg; + int found; + const char *kvsvalue; + int kvsvallen; + + PMI2U_printf("[BEGIN PMI2_Ring]"); + + /* send message: cmd=ring_in, count=1, left=value, right=value */ + pmi2_errno = PMIi_WriteSimpleCommandStr(PMI2_fd, &cmd, RING_CMD, + RING_COUNT_KEY, "1", + RING_LEFT_KEY, value, + RING_RIGHT_KEY, value, + NULL); + if (pmi2_errno) PMI2U_ERR_POP(pmi2_errno); + + /* wait for reply: cmd=ring_out, rc=0|1, count=rank, left=leftval, right=rightval */ + pmi2_errno = PMIi_ReadCommandExp(PMI2_fd, &cmd, RINGRESP_CMD, &rc, &errmsg); + if (pmi2_errno) PMI2U_ERR_SETANDJUMP(1, pmi2_errno, "PMIi_ReadCommandExp"); + PMI2U_ERR_CHKANDJUMP(rc, pmi2_errno, PMI2_ERR_OTHER, "**pmi2_ring %s", errmsg ? errmsg : "unknown"); + + /* get our rank from the count key */ + found = getvalint(cmd.pairs, cmd.nPairs, RING_COUNT_KEY, rank); + PMI2U_ERR_CHKANDJUMP(found != 1, pmi2_errno, PMI2_ERR_OTHER, "**intern"); + + /* set size of ring (just number of procs in job) */ + *ranks = PMI2_size; + + /* lookup left value and copy to caller's buffer */ + found = getval(cmd.pairs, cmd.nPairs, RING_LEFT_KEY, &kvsvalue, &kvsvallen); + PMI2U_ERR_CHKANDJUMP(found != 1, pmi2_errno, PMI2_ERR_OTHER, "**intern"); + MPIU_Strncpy(left, kvsvalue, maxvalue); + + /* lookup right value and copy to caller's buffer */ + found = getval(cmd.pairs, cmd.nPairs, RING_RIGHT_KEY, &kvsvalue, &kvsvallen); + PMI2U_ERR_CHKANDJUMP(found != 1, pmi2_errno, PMI2_ERR_OTHER, "**intern"); + MPIU_Strncpy(right, kvsvalue, maxvalue); + +fn_exit: + free(cmd.command); + freepairs(cmd.pairs, cmd.nPairs); + PMI2U_printf("[END PMI2_Ring]"); + return pmi2_errno; +fn_fail: + goto fn_exit; +} + int PMI2_KVS_Put(const char key[], const char value[]) { int pmi2_errno = PMI2_SUCCESS; diff --git a/contribs/pmi2/slurm/pmi2.h b/contribs/pmi2/slurm/pmi2.h index ddcc36c888c..59bdad6bef6 100644 --- a/contribs/pmi2/slurm/pmi2.h +++ b/contribs/pmi2/slurm/pmi2.h @@ -52,6 +52,8 @@ static const char NAMEUNPUBLISH_CMD[] = "name-unpublish"; static const char NAMEUNPUBLISHRESP_CMD[] = "name-unpublish-response"; static const char NAMELOOKUP_CMD[] = "name-lookup"; static const char NAMELOOKUPRESP_CMD[] = "name-lookup-response"; +static const char RING_CMD[] = "ring"; +static const char RINGRESP_CMD[] = "ring-response"; static const char PMIJOBID_KEY[] = "pmijobid"; static const char PMIRANK_KEY[] = "pmirank"; @@ -81,6 +83,9 @@ static const char THRID_KEY[] = "thrid"; static const char INFOKEYCOUNT_KEY[] = "infokeycount"; static const char INFOKEY_KEY[] = "infokey%d"; static const char INFOVAL_KEY[] = "infoval%d"; +static const char RING_COUNT_KEY[] = "ring-count"; +static const char RING_LEFT_KEY[] = "ring-left"; +static const char RING_RIGHT_KEY[] = "ring-right"; static const char TRUE_VAL[] = "TRUE"; static const char FALSE_VAL[] = "FALSE"; @@ -412,6 +417,33 @@ int PMI2_Job_Connect(const char jobid[], PMI2_Connect_comm_t *conn); @*/ int PMI2_Job_Disconnect(const char jobid[]); +/*@ + PMIX_Ring - execute ring exchange over processes in group + + Input Parameters: + + value - input string + - maxvalue - max size of input and output strings + + Output Parameters: + + rank - returns caller's rank within ring + - ranks - returns number of procs within ring + - left - buffer to receive value provided by (rank - 1) % ranks + - right - buffer to receive value provided by (rank + 1) % ranks + + Return values: + Returns 'MPI_SUCCESS' on success and an MPI error code on failure. + + Notes: + This function is collective, but not necessarily synchronous, + across all processes in the process group to which the calling + process belongs. All processes in the group must call this + function, but a process may return before all processes have called + the function. + +@*/ +#define HAVE_PMIX_RING 1 /* so one can conditionally compile with this funciton */ +int PMIX_Ring(const char value[], int *rank, int *ranks, char left[], char right[], int maxvalue); + /*@ PMI2_KVS_Put - put a key/value pair in the keyval space for this job diff --git a/contribs/pmi2/testpmixring.c b/contribs/pmi2/testpmixring.c new file mode 100644 index 00000000000..825ed654dc6 --- /dev/null +++ b/contribs/pmi2/testpmixring.c @@ -0,0 +1,63 @@ + +#include <stdio.h> +#include <time.h> +#include <stdlib.h> +#include <string.h> +//#include <mpi.h> +#include <slurm/pmi2.h> +#include <sys/time.h> + +/* + * To build: + * + * gcc -g -O0 -o testpmixring testpmixring.c -I<slurm_install>/include -Wl,-rpath,<slurm_install>/lib -L<slurm_install>/lib -lpmi2 + * + * To run: + * + * srun -n8 -m block ./testpmixring + * srun -n8 -m cyclic ./testpmixring + */ + +int +main(int argc, char **argv) +{ + int spawned, size, rank, appnum; + struct timeval tv, tv2; + int ring_rank, ring_size; + char jobid[128]; + char val[128]; + char buf[128]; + char left[128]; + char right[128]; + + { + int x = 1; + + while (x) { + fprintf(stderr, "attachme %d\n", getpid()); + sleep(2); + } + } + + gettimeofday(&tv, NULL); + + PMI2_Init(&spawned, &size, &rank, &appnum); + + PMI2_Job_GetId(jobid, sizeof(buf)); + + /* test PMIX_Ring */ + snprintf(val, sizeof(val), "pmi_rank=%d", rank); + PMIX_Ring(val, &ring_rank, &ring_size, left, right, 128); + + printf("pmi_rank:%d ring_rank:%d ring_size:%d left:%s mine:%s right:%s\n", + rank, ring_rank, ring_size, left, val, right); + + PMI2_Finalize(); + + gettimeofday(&tv2, NULL); + printf("%f\n", + ((tv2.tv_sec - tv.tv_sec) * 1000.0 + + (tv2.tv_usec - tv.tv_usec) / 1000.0)); + + return 0; +} diff --git a/src/plugins/mpi/pmi2/Makefile.am b/src/plugins/mpi/pmi2/Makefile.am index 1b1fbd99274..a7df063a373 100644 --- a/src/plugins/mpi/pmi2/Makefile.am +++ b/src/plugins/mpi/pmi2/Makefile.am @@ -17,7 +17,8 @@ mpi_pmi2_la_SOURCES = mpi_pmi2.c \ setup.c setup.h \ spawn.c spawn.h \ tree.c tree.h \ - nameserv.c nameserv.h + nameserv.c nameserv.h \ + ring.c ring.h mpi_pmi2_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/mpi/pmi2/Makefile.in b/src/plugins/mpi/pmi2/Makefile.in index 033f888a9aa..a52d25d1505 100644 --- a/src/plugins/mpi/pmi2/Makefile.in +++ b/src/plugins/mpi/pmi2/Makefile.in @@ -164,7 +164,7 @@ am__installdirs = "$(DESTDIR)$(pkglibdir)" LTLIBRARIES = $(pkglib_LTLIBRARIES) mpi_pmi2_la_DEPENDENCIES = $(top_builddir)/src/slurmd/common/libslurmd_reverse_tree_math.la am_mpi_pmi2_la_OBJECTS = mpi_pmi2.lo agent.lo client.lo kvs.lo info.lo \ - pmi1.lo pmi2.lo setup.lo spawn.lo tree.lo nameserv.lo + pmi1.lo pmi2.lo setup.lo spawn.lo tree.lo nameserv.lo ring.lo mpi_pmi2_la_OBJECTS = $(am_mpi_pmi2_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -488,7 +488,8 @@ mpi_pmi2_la_SOURCES = mpi_pmi2.c \ setup.c setup.h \ spawn.c spawn.h \ tree.c tree.h \ - nameserv.c nameserv.h + nameserv.c nameserv.h \ + ring.c ring.h mpi_pmi2_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) mpi_pmi2_la_LIBADD = \ @@ -581,6 +582,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nameserv.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pmi1.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pmi2.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ring.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/setup.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/spawn.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tree.Plo@am__quote@ diff --git a/src/plugins/mpi/pmi2/pmi.h b/src/plugins/mpi/pmi2/pmi.h index 3703870fc0a..b60592b6406 100644 --- a/src/plugins/mpi/pmi2/pmi.h +++ b/src/plugins/mpi/pmi2/pmi.h @@ -152,6 +152,8 @@ #define NAMELOOKUPRESP_CMD "name-lookup-response" #define SPAWN_CMD "spawn" #define SPAWNRESP_CMD "spawn-response" +#define RING_CMD "ring" +#define RINGRESP_CMD "ring-response" #define GETMYKVSNAME_CMD "get_my_kvsname" #define GETMYKVSNAMERESP_CMD "my_kvsname" @@ -208,6 +210,9 @@ #define ERRCODES_KEY "errcodes" #define SERVICE_KEY "service" #define INFO_KEY "info" +#define RING_COUNT_KEY "ring-count" +#define RING_LEFT_KEY "ring-left" +#define RING_RIGHT_KEY "ring-right" #define TRUE_VAL "TRUE" #define FALSE_VAL "FALSE" @@ -232,6 +237,7 @@ #define PMI2_PPKEY_ENV "SLURM_PMI2_PPKEY" #define PMI2_PPVAL_ENV "SLURM_PMI2_PPVAL" #define SLURM_STEP_RESV_PORTS "SLURM_STEP_RESV_PORTS" +#define PMIX_RING_TREE_WIDTH_ENV "SLURM_PMIX_RING_WIDTH" /* old PMIv1 envs */ #define PMI2_PMI_DEBUGGED_ENV "PMI_DEBUG" #define PMI2_KVS_NO_DUP_KEYS_ENV "SLURM_PMI_KVS_NO_DUP_KEYS" diff --git a/src/plugins/mpi/pmi2/pmi2.c b/src/plugins/mpi/pmi2/pmi2.c index d960853defc..84234507b7c 100644 --- a/src/plugins/mpi/pmi2/pmi2.c +++ b/src/plugins/mpi/pmi2/pmi2.c @@ -60,6 +60,7 @@ #include "setup.h" #include "agent.h" #include "nameserv.h" +#include "ring.h" /* PMI2 command handlers */ static int _handle_fullinit(int fd, int lrank, client_req_t *req); @@ -68,6 +69,7 @@ static int _handle_abort(int fd, int lrank, client_req_t *req); static int _handle_job_getid(int fd, int lrank, client_req_t *req); static int _handle_job_connect(int fd, int lrank, client_req_t *req); static int _handle_job_disconnect(int fd, int lrank, client_req_t *req); +static int _handle_ring(int fd, int lrank, client_req_t *req); static int _handle_kvs_put(int fd, int lrank, client_req_t *req); static int _handle_kvs_fence(int fd, int lrank, client_req_t *req); static int _handle_kvs_get(int fd, int lrank, client_req_t *req); @@ -90,6 +92,7 @@ static struct { { JOBGETID_CMD, _handle_job_getid }, { JOBCONNECT_CMD, _handle_job_connect }, { JOBDISCONNECT_CMD, _handle_job_disconnect }, + { RING_CMD, _handle_ring }, { KVSPUT_CMD, _handle_kvs_put }, { KVSFENCE_CMD, _handle_kvs_fence }, { KVSGET_CMD, _handle_kvs_get }, @@ -103,7 +106,6 @@ static struct { { NULL, NULL}, }; - static int _handle_fullinit(int fd, int lrank, client_req_t *req) { @@ -227,6 +229,35 @@ _handle_job_disconnect(int fd, int lrank, client_req_t *req) return rc; } +static int +_handle_ring(int fd, int lrank, client_req_t *req) +{ + int rc = SLURM_SUCCESS; + int count = 0; + char *left = NULL; + char *right = NULL; + + debug3("mpi/pmi2: in _handle_ring"); + + /* extract left, right, and count values from ring payload */ + client_req_parse_body(req); + client_req_get_int(req, RING_COUNT_KEY, &count); + client_req_get_str(req, RING_LEFT_KEY, &left); + client_req_get_str(req, RING_RIGHT_KEY, &right); + + /* compute ring_id, we list all application tasks first, + * followed by stepds, so here we just use the application + * process rank */ + int ring_id = lrank; + + rc = pmix_ring_in(ring_id, count, left, right); + + /* the repsonse is sent back to client from the pmix_ring_out call */ + + debug3("mpi/pmi2: out _handle_ring"); + return rc; +} + static int _handle_kvs_put(int fd, int lrank, client_req_t *req) { diff --git a/src/plugins/mpi/pmi2/setup.c b/src/plugins/mpi/pmi2/setup.c index 30153ea30f3..de670ec2437 100644 --- a/src/plugins/mpi/pmi2/setup.c +++ b/src/plugins/mpi/pmi2/setup.c @@ -66,6 +66,7 @@ #include "pmi.h" #include "spawn.h" #include "kvs.h" +#include "ring.h" #define PMI2_SOCK_ADDR_FMT "/tmp/sock.pmi2.%u.%u" @@ -370,6 +371,12 @@ pmi2_setup_stepd(const stepd_step_rec_t *job, char ***env) if (rc != SLURM_SUCCESS) return rc; + /* TODO: finalize pmix_ring state somewhere */ + /* initialize pmix_ring state */ + rc = pmix_ring_init(&job_info, env); + if (rc != SLURM_SUCCESS) + return rc; + return SLURM_SUCCESS; } diff --git a/src/plugins/mpi/pmi2/tree.c b/src/plugins/mpi/pmi2/tree.c index 32f7e3f8fd9..ecac5dc51a8 100644 --- a/src/plugins/mpi/pmi2/tree.c +++ b/src/plugins/mpi/pmi2/tree.c @@ -57,6 +57,7 @@ #include "setup.h" #include "pmi.h" #include "nameserv.h" +#include "ring.h" static int _handle_kvs_fence(int fd, Buf buf); static int _handle_kvs_fence_resp(int fd, Buf buf); @@ -65,6 +66,8 @@ static int _handle_spawn_resp(int fd, Buf buf); static int _handle_name_publish(int fd, Buf buf); static int _handle_name_unpublish(int fd, Buf buf); static int _handle_name_lookup(int fd, Buf buf); +static int _handle_ring(int fd, Buf buf); +static int _handle_ring_resp(int fd, Buf buf); static uint32_t spawned_srun_ports_size = 0; static uint16_t *spawned_srun_ports = NULL; @@ -78,6 +81,8 @@ static int (*tree_cmd_handlers[]) (int fd, Buf buf) = { _handle_name_publish, _handle_name_unpublish, _handle_name_lookup, + _handle_ring, + _handle_ring_resp, NULL }; @@ -89,6 +94,8 @@ static char *tree_cmd_names[] = { "TREE_CMD_NAME_PUBLISH", "TREE_CMD_NAME_UNPUBLISH", "TREE_CMD_NAME_LOOKUP", + "TREE_CMD_RING", + "TREE_CMD_RING_RESP", NULL, }; @@ -501,6 +508,92 @@ unpack_error: goto out; } +/* handles ring_in message from one of our stepd children */ +static int +_handle_ring(int fd, Buf buf) +{ + uint32_t rank, count, temp32; + char *left = NULL; + char *right = NULL; + int ring_id; + int rc = SLURM_SUCCESS; + + debug3("mpi/pmi2: in _handle_ring"); + + /* TODO: do we need ntoh translation? */ + + /* data consists of: + * uint32_t rank - tree rank of stepd process that sent message + * uint32_t count - ring in count value + * string left - ring in left value + * string right - ring in right value */ + safe_unpack32(&rank, buf); + safe_unpack32(&count, buf); + safe_unpackstr_xmalloc(&left, &temp32, buf); + safe_unpackstr_xmalloc(&right, &temp32, buf); + + /* lookup ring_id for this child */ + ring_id = pmix_ring_id_by_rank(rank); + + /* check that we got a valid child id */ + if (ring_id == -1) { + error("mpi/pmi2: received ring_in message from unknown child %d", rank); + rc = SLURM_ERROR; + goto out; + } + + /* execute ring in operation */ + rc = pmix_ring_in(ring_id, count, left, right); + +out: + /* free strings unpacked from message */ + xfree(left); + xfree(right); + debug3("mpi/pmi2: out _handle_ring"); + return rc; + +unpack_error: + error("mpi/pmi2: failed to unpack ring in message"); + rc = SLURM_ERROR; + goto out; +} + +/* handles ring_out messages coming in from parent in stepd tree */ +static int +_handle_ring_resp(int fd, Buf buf) +{ + uint32_t count, temp32; + char *left = NULL; + char *right = NULL; + int rc = SLURM_SUCCESS; + + debug3("mpi/pmi2: in _handle_ring_resp"); + + /* TODO: need ntoh translation? */ + /* data consists of: + * uint32_t count - ring out count value + * string left - ring out left value + * string right - ring out right value */ + safe_unpack32(&count, buf); + safe_unpackstr_xmalloc(&left, &temp32, buf); + safe_unpackstr_xmalloc(&right, &temp32, buf); + + /* execute ring out operation */ + rc = pmix_ring_out(count, left, right); + +out: + /* free strings unpacked from message */ + xfree(left); + xfree(right); + debug3("mpi/pmi2: out _handle_ring_resp"); + return rc; + +unpack_error: + error("mpi/pmi2: failed to unpack ring out message"); + rc = SLURM_ERROR; + goto out; +} + /**************************************************************/ extern int handle_tree_cmd(int fd) diff --git a/src/plugins/mpi/pmi2/tree.h b/src/plugins/mpi/pmi2/tree.h index 6796ab4ec07..ccc254c235e 100644 --- a/src/plugins/mpi/pmi2/tree.h +++ b/src/plugins/mpi/pmi2/tree.h @@ -50,6 +50,8 @@ enum { TREE_CMD_NAME_PUBLISH, TREE_CMD_NAME_UNPUBLISH, TREE_CMD_NAME_LOOKUP, + TREE_CMD_RING, + TREE_CMD_RING_RESP, TREE_CMD_COUNT }; -- GitLab