From 510780bd70a01ff59bce87dd7944a062fa9cc284 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 11 Nov 2005 23:30:33 +0000 Subject: [PATCH] Update code to check reply to KVS xmit for errors. --- src/srun/pmi.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/src/srun/pmi.c b/src/srun/pmi.c index 05987e3e5ef..d8a7dbcdeea 100644 --- a/src/srun/pmi.c +++ b/src/srun/pmi.c @@ -39,7 +39,9 @@ #include "src/common/xstring.h" #include "src/common/xmalloc.h" -#define _DEBUG 0 +#define _DEBUG 0 /* non-zero for extra KVS logging */ +#define MSG_RE_TRANSMIT 1 /* re-transmit KVS messages this number of times */ +#define MSG_PARALLELISM 50 /* count of simultaneous KVS message threads */ /* Global variables */ pthread_mutex_t kvs_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -49,17 +51,18 @@ struct kvs_comm **kvs_comm_ptr = NULL; struct barrier_resp { uint16_t port; char *hostname; -}; +}; /* details for barrier task communcations */ struct barrier_resp *barrier_ptr = NULL; -uint16_t barrier_resp_cnt = 0; -uint16_t barrier_cnt = 0; +uint16_t barrier_resp_cnt = 0; /* tasks having reached barrier */ +uint16_t barrier_cnt = 0; /* tasks needing to reach barrier */ struct agent_arg { struct barrier_resp *barrier_xmit_ptr; int barrier_xmit_cnt; struct kvs_comm **kvs_xmit_ptr; int kvs_xmit_cnt; -}; +}; /* details for message agent manager */ +int agent_cnt = 0; /* number of active message agents */ static void *_agent(void *x); static struct kvs_comm *_find_kvs_by_name(char *name); @@ -106,14 +109,17 @@ static void *_agent(void *x) struct agent_arg *args = (struct agent_arg *) x; struct kvs_comm_set kvs_set; int i, j, rc, task_fd; + int success_xmit = 0, retries = 0; slurm_msg_t msg_send, msg_rcv; - /* send the message */ + /* send the messages */ kvs_set.kvs_comm_recs = args->kvs_xmit_cnt; kvs_set.kvs_comm_ptr = args->kvs_xmit_ptr; msg_send.msg_type = PMI_KVS_GET_RESP; msg_send.data = (void *) &kvs_set; for (i=0; i<args->barrier_xmit_cnt; i++) { + if (args->barrier_xmit_ptr[i].port == 0) + continue; debug2("KVS_Barrier msg to %s:%u", args->barrier_xmit_ptr[i].hostname, args->barrier_xmit_ptr[i].port); @@ -136,8 +142,26 @@ static void *_agent(void *x) if (rc < 0) { error("KVS_Barrier confirm fail from %s", args->barrier_xmit_ptr[i].hostname); + continue; + } + if (msg_rcv.msg_type != RESPONSE_SLURM_RC) { + error("KVS_Barrier confirm type %d from %s", + msg_rcv.msg_type, + args->barrier_xmit_ptr[i].hostname); + continue; + } + rc = ((return_code_msg_t *) msg_rcv.data)->return_code; + slurm_free_return_code_msg((return_code_msg_t *) msg_rcv.data); + if (rc != SLURM_SUCCESS) { + error("KVS_Barrier confirm from %s, rc=%d", + args->barrier_xmit_ptr[i].hostname, rc); + } else { + /* successfully transmitted KVS keypairs */ + args->barrier_xmit_ptr[i].port = 0; + success_xmit++; } } + /* Release allocated memory */ for (i=0; i<args->barrier_xmit_cnt; i++) -- GitLab