From d2911e0a64d068830c8cf94d51d1057c34fb8c1e Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Mon, 24 Jul 2006 23:22:09 +0000
Subject: [PATCH] svn merge -r8670:8680
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1

---
 NEWS                            |  2 +-
 src/slurmctld/agent.c           | 37 +++++++++++++++++++++++++++++++++--
 src/srun/pmi.c                  | 18 +++++++++++++----
 testsuite/expect/test7.2.prog.c |  8 ++++----
 4 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/NEWS b/NEWS
index 08e42e6e23a..e41c089bebc 100644
--- a/NEWS
+++ b/NEWS
@@ -1783,7 +1783,7 @@ documents those changes that are of interest to users and admins.
  -- "fatal: _shm_unlock: Numerical result out of range" bug fixed in slurmd.
  -- Config file parsing is now case insensitive.
  -- SLURM_NODELIST environment variable now set in allocate mode.
- 
+
 * Changes in SLURM 0.2.0-pre2
 =============================
 
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 78c36d2eadd..ef25a028b57 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -1208,15 +1208,30 @@ extern int agent_retry (int min_wait)
 	int list_size = 0;
 	time_t now = time(NULL);
 	queued_request_t *queued_req_ptr = NULL;
+	agent_arg_t *agent_arg_ptr;
+	ListIterator retry_iter;
 
 	if (retry_list) {
 		static time_t last_msg_time = (time_t) 0;
+		uint32_t msg_type[5] = {0, 0, 0, 0, 0}, i = 0;
 		list_size = list_count(retry_list);
 		if ((list_size > MAX_AGENT_CNT)
 		&&  (difftime(now, last_msg_time) > 300)) {
 			/* Note sizable backlog of work */
 			info("WARNING: agent retry_list size is %d",
 				list_size);
+			retry_iter = list_iterator_create(retry_list);
+			while ((queued_req_ptr = (queued_request_t *)
+					list_next(retry_iter))) {
+				agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
+				msg_type[i++] = agent_arg_ptr->msg_type;
+				if (i == 5)
+					break;
+			}
+			list_iterator_destroy(retry_iter);
+			info(" retry_list msg_type=%u,%u,%u,%u,%u",
+				msg_type[0], msg_type[1], msg_type[2],
+				msg_type[3], msg_type[4]);
 			last_msg_time = now;
 		}
 	}
@@ -1225,10 +1240,28 @@ extern int agent_retry (int min_wait)
 
 	slurm_mutex_lock(&retry_mutex);
 	if (retry_list) {
-		ListIterator retry_iter;
+		/* first try to find a new (never tried) record */
+
+		retry_iter = list_iterator_create(retry_list);
+		while ((queued_req_ptr = (queued_request_t *)
+				list_next(retry_iter))) {
+			if (queued_req_ptr->last_attempt == 0) {
+				/* dequeue only a never-tried request */
+				list_remove(retry_iter);
+				list_size--;
+				break;
+			}
+		}
+		list_iterator_destroy(retry_iter);
+	}
+
+	if (retry_list && (queued_req_ptr == NULL)) {
+		/* now try to find a requeue request that is
+		 * relatively old */
 		double age = 0;
 
 		retry_iter = list_iterator_create(retry_list);
+		/* next try to find an older record to retry */
 		while ((queued_req_ptr = (queued_request_t *)
 				list_next(retry_iter))) {
 			age = difftime(now, queued_req_ptr->last_attempt);
@@ -1243,7 +1276,7 @@ extern int agent_retry (int min_wait)
 	slurm_mutex_unlock(&retry_mutex);
 
 	if (queued_req_ptr) {
-		agent_arg_t *agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
+		agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
 		xfree(queued_req_ptr);
 		if (agent_arg_ptr)
 			_spawn_retry_agent(agent_arg_ptr);
diff --git a/src/srun/pmi.c b/src/srun/pmi.c
index 289275d3cf4..bf40d48b71b 100644
--- a/src/srun/pmi.c
+++ b/src/srun/pmi.c
@@ -47,6 +47,7 @@
 /* Global variables */
 pthread_mutex_t kvs_mutex = PTHREAD_MUTEX_INITIALIZER;
 int kvs_comm_cnt = 0;
+int kvs_updated = 0;
 struct kvs_comm **kvs_comm_ptr = NULL;
 
 struct barrier_resp {
@@ -95,15 +96,23 @@ static void _kvs_xmit_tasks(void)
 #if _DEBUG
 	info("All tasks at barrier, transmit KVS keypairs now");
 #endif
-	/* copy the data */
+	/* reset barrier info */
 	args = xmalloc(sizeof(struct agent_arg));
 	args->barrier_xmit_ptr = barrier_ptr;
 	args->barrier_xmit_cnt = barrier_cnt;
 	barrier_ptr = NULL;
 	barrier_resp_cnt = 0;
 	barrier_cnt = 0;
-	args->kvs_xmit_ptr = _kvs_comm_dup();
-	args->kvs_xmit_cnt = kvs_comm_cnt;
+
+	/* copy the new kvs data */
+	if (kvs_updated) {
+		args->kvs_xmit_ptr = _kvs_comm_dup();
+		args->kvs_xmit_cnt = kvs_comm_cnt;
+		kvs_updated = 0;
+	} else {	/* No new data to transmit */
+		args->kvs_xmit_ptr = xmalloc(0);
+		args->kvs_xmit_cnt = 0;
+	}
 
 	/* Spawn a pthread to transmit it */
 	slurm_attr_init(&attr);
@@ -316,12 +325,13 @@ extern int pmi_kvs_put(struct kvs_comm_set *kvs_set_ptr)
 			_merge_named_kvs(kvs_ptr,
 				kvs_set_ptr->kvs_comm_ptr[i]);
 		} else {
-			_move_kvs(kvs_set_ptr-> kvs_comm_ptr[i]);
+			_move_kvs(kvs_set_ptr->kvs_comm_ptr[i]);
 			kvs_set_ptr-> kvs_comm_ptr[i] = NULL;
 		}
 	}
 	slurm_free_kvs_comm_set(kvs_set_ptr);
 	_print_kvs();
+	kvs_updated = 1;
 	pthread_mutex_unlock(&kvs_mutex);
 	return SLURM_SUCCESS;
 }
diff --git a/testsuite/expect/test7.2.prog.c b/testsuite/expect/test7.2.prog.c
index ad46a923498..a73c636b3fa 100644
--- a/testsuite/expect/test7.2.prog.c
+++ b/testsuite/expect/test7.2.prog.c
@@ -34,11 +34,11 @@
 #else
 /* Typical MVAPICH2 use
  *
- * Typically takes very long time for large task count
- * adjust job time limit and timeout in test7.2 as needed
+ * Adjust job time limit and timeout in test7.2 as needed
+ * for large values.
  */
-#  define BARRIER_CNT        7
-#  define PUTS_PER_BARRIER  32
+#  define BARRIER_CNT        4
+#  define PUTS_PER_BARRIER   0
 #endif
 
 #define OFFSET_1  1234
-- 
GitLab