From b974697e7a3dc0ec07e210f078a7bd87eb9a76ae Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 26 May 2006 23:30:01 +0000 Subject: [PATCH] Fix bug in PMI support that prevented use of second PMI_Barrier call. --- NEWS | 1 + src/api/slurm_pmi.c | 59 +++++++------------------- testsuite/expect/test7.2 | 2 +- testsuite/expect/test7.2.prog.c | 75 +++++++++++++++++++++++++++++++-- 4 files changed, 90 insertions(+), 47 deletions(-) diff --git a/NEWS b/NEWS index 9ef885adb9a..1377200afda 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,7 @@ documents those changes that are of interest to users and admins. -- If a user breaks out of srun before the allocation takes place, mark the job as CANCELLED rather than COMPLETED and change its start and end time to that time. + -- Fix bug in PMI support that prevented use of second PMI_Barrier call. * Changes in SLURM 1.1.0 ======================== diff --git a/src/api/slurm_pmi.c b/src/api/slurm_pmi.c index 9b1c5036b6e..751eff28ee1 100644 --- a/src/api/slurm_pmi.c +++ b/src/api/slurm_pmi.c @@ -60,9 +60,8 @@ static int _get_addr(void) /* Transmit PMI Keyval space data */ int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr) { - slurm_msg_t msg_send;//, msg_rcv; + slurm_msg_t msg_send; int rc; - //List ret_list; if (kvs_set_ptr == NULL) return EINVAL; @@ -78,24 +77,11 @@ int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr) msg_send.forward_struct_init = 0; /* Send the RPC to the local srun communcation manager */ - slurm_send_recv_rc_msg_only_one(&msg_send, &rc, 0); - /* ret_list = (List) slurm_send_recv_node_msg(&msg_send, &msg_rcv, 0); */ - -/* if(!ret_list || errno != SLURM_SUCCESS) { */ -/* error("slurm_send_kvs_comm_set: %m"); */ -/* return SLURM_ERROR; */ -/* } */ -/* if(list_count(ret_list)>0) { */ -/* error("slurm_send_kvs_comm_set: " */ -/* "got %d from receive, expecting 0", */ -/* list_count(ret_list)); */ -/* } */ -/* list_destroy(ret_list); */ - -/* if (msg_rcv.msg_type != RESPONSE_SLURM_RC) */ -/* return SLURM_UNEXPECTED_MSG_ERROR; */ -/* rc = ((return_code_msg_t *) msg_rcv.data)->return_code; */ -/* slurm_free_return_code_msg((return_code_msg_t *) msg_rcv.data); */ + if (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, 0) < 0) { + error("slurm_get_kvs_comm_set: %m"); + return SLURM_ERROR; + } + return rc; } @@ -105,7 +91,7 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr, { int rc, srun_fd; slurm_msg_t msg_send, msg_rcv; - slurm_addr slurm_addr; + slurm_addr slurm_addr, srun_reply_addr; char hostname[64]; uint16_t port; kvs_get_msg_t data; @@ -151,43 +137,29 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr, forward_init(&msg_send.forward, NULL); msg_send.ret_list = NULL; msg_send.forward_struct_init = 0; - + /* Send the RPC to the local srun communcation manager */ - slurm_send_recv_rc_msg_only_one(&msg_send, &rc, 0); - /* ret_list = (List) slurm_send_recv_node_msg(&msg_send, &msg_rcv, 0); */ - -/* if(!ret_list || errno != SLURM_SUCCESS) { */ -/* error("slurm_send_recv_node_msg: %m"); */ -/* return SLURM_ERROR; */ -/* } */ -/* if(list_count(ret_list)>0) { */ -/* error("slurm_send_recv_node_msg: " */ -/* "got %d from receive, expecting 0", */ -/* list_count(ret_list)); */ -/* } */ -/* list_destroy(ret_list); */ - -/* if (msg_rcv.msg_type != RESPONSE_SLURM_RC) { */ -/* error("slurm_get_kvs_comm_set msg_type=%d", msg_rcv.msg_type); */ -/* return SLURM_UNEXPECTED_MSG_ERROR; */ -/* } */ -/* rc = ((return_code_msg_t *) msg_rcv.data)->return_code; */ -/* slurm_free_return_code_msg((return_code_msg_t *) msg_rcv.data); */ + if (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, 0) < 0) { + error("slurm_get_kvs_comm_set: %m"); + return SLURM_ERROR; + } if (rc != SLURM_SUCCESS) { error("slurm_get_kvs_comm_set error_code=%d", rc); return rc; } /* get the message after all tasks reach the barrier */ - srun_fd = slurm_accept_msg_conn(pmi_fd, &srun_addr); + srun_fd = slurm_accept_msg_conn(pmi_fd, &srun_reply_addr); if (srun_fd < 0) { error("slurm_accept_msg_conn: %m"); return errno; } + while ((ret_list = slurm_receive_msg(srun_fd, &msg_rcv, 0)) == NULL) { if (errno == EINTR) continue; error("slurm_receive_msg: %m"); + slurm_close_accepted_conn(srun_fd); return errno; } if(ret_list) { @@ -201,6 +173,7 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr, msg_rcv.conn_fd = srun_fd; if (msg_rcv.msg_type != PMI_KVS_GET_RESP) { error("slurm_get_kvs_comm_set msg_type=%d", msg_rcv.msg_type); + slurm_close_accepted_conn(srun_fd); return SLURM_UNEXPECTED_MSG_ERROR; } if (slurm_send_rc_msg(&msg_rcv, SLURM_SUCCESS) < 0) diff --git a/testsuite/expect/test7.2 b/testsuite/expect/test7.2 index b3d3ce20edc..95879756ef8 100755 --- a/testsuite/expect/test7.2 +++ b/testsuite/expect/test7.2 @@ -62,7 +62,7 @@ if { [test_bluegene] } { } } -spawn $srun -l -N$node_cnt -n10 -O -t1 $file_prog_get +spawn $srun -l -N$node_cnt -n6 -O -t1 $file_prog_get expect { -re "FAILURE" { send_user "\nFAILURE: some error occured\n" diff --git a/testsuite/expect/test7.2.prog.c b/testsuite/expect/test7.2.prog.c index f3a59fe2f7e..eda7cd39439 100644 --- a/testsuite/expect/test7.2.prog.c +++ b/testsuite/expect/test7.2.prog.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * test7.2.prog.c - Test of basic PMI library functionality ***************************************************************************** - * Copyright (C) 2005 The Regents of the University of California. + * Copyright (C) 2005-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * UCRL-CODE-217948. @@ -103,7 +103,8 @@ main (int argc, char **argv) printf("PMI_KVS_Get_name_length_max = %d\n", kvs_name_len); kvs_name = malloc(kvs_name_len); if ((rc = PMI_KVS_Get_my_name(kvs_name, kvs_name_len)) != PMI_SUCCESS) { - printf("FAILURE: PMI_KVS_Get_my_name: %d, task %d\n", rc, pmi_rank); + printf("FAILURE: PMI_KVS_Get_my_name: %d, task %d\n", rc, + pmi_rank); exit(1); } printf("PMI_KVS_Get_my_name = %s\n", kvs_name); @@ -150,7 +151,8 @@ main (int argc, char **argv) exit(1); } printf("PMI_Barrier completed\n"); - /* Tasks 0 and 1 only: Now lets get all keypairs and validate */ + + /* Tasks 0 and 1 only: Now lets get all keypairs and validate */ if (pmi_rank <= 1) { for (i=0; i<pmi_size; i++) { snprintf(key, key_len, "ATTR_1_%d", i); @@ -211,6 +213,73 @@ main (int argc, char **argv) } } + /* Build some more key=val pairs */ + snprintf(key, key_len, "ATTR_3_%d", procid); + snprintf(val, val_len, "C%d", procid+OFFSET_1); + if ((rc = PMI_KVS_Put(kvs_name, key, val)) != PMI_SUCCESS) { + printf("FAILURE: PMI_KVS_Put(%s,%s,%s): %d, task %d\n", + kvs_name, key, val, rc, pmi_rank); + exit(1); + } + printf("PMI_KVS_Put(%s,%s,%s)\n", kvs_name, key, val); + snprintf(key, key_len, "attr_4_%d", procid); + snprintf(val, val_len, "D%d", procid+OFFSET_2); + if ((rc = PMI_KVS_Put(kvs_name, key, val)) != PMI_SUCCESS) { + printf("FAILURE: PMI_KVS_Put(%s,%s,%s): %d, task %d\n", + kvs_name, key, val, rc, pmi_rank); + exit(1); + } + printf("PMI_KVS_Put(%s,%s,%s)\n", kvs_name, key, val); + + /* Sync KVS across all tasks */ + if ((rc = PMI_KVS_Commit(kvs_name)) != PMI_SUCCESS) { + printf("FAILURE: PMI_KVS_Commit: %d, task %d\n", rc, pmi_rank); + exit(1); + } + printf("PMI_KVS_Commit completed\n"); + + if ((rc = PMI_Barrier()) != PMI_SUCCESS) { + printf("FAILURE: PMI_Barrier: %d, task %d\n", rc, pmi_rank); + exit(1); + } + printf("PMI_Barrier completed\n"); + + /* Tasks 0 and 1 only: Now lets get some keypairs and validate */ + if (pmi_rank <= 1) { + for (i=0; i<pmi_size; i++) { + snprintf(key, key_len, "ATTR_1_%d", i); + if ((rc = PMI_KVS_Get(kvs_name, key, val, val_len)) + != PMI_SUCCESS) { + printf("FAILURE: PMI_KVS_Get(%s): %d, task %d\n", + key, rc, pmi_rank); + exit(1); + } + if ((val[0] != 'A') + || ((atoi(&val[1])-OFFSET_1) != i)) { + printf("FAILURE: Bad keypair %s=%s, task %d\n", + key, val, pmi_rank); + exit(1); + } + printf("PMI_KVS_Get(%s,%s) %s\n", kvs_name, key, val); + + snprintf(key, key_len, "attr_4_%d", i); + if ((rc = PMI_KVS_Get(kvs_name, key, val, val_len)) + != PMI_SUCCESS) { + printf("FAILURE: PMI_KVS_Get(%s): %d, task %d\n", + key, rc, pmi_rank); + exit(1); + } + if ((val[0] != 'D') + || ((atoi(&val[1])-OFFSET_2) != i)) { + printf("FAILURE: Bad keypair %s=%s, task %d\n", + key,val, pmi_rank); + exit(1); + } + printf("PMI_KVS_Get(%s,%s) %s\n", kvs_name, key, val); + + } + } + /* create new keyspace and test it */ if ((rc = PMI_KVS_Create(kvs_name, kvs_name_len)) != PMI_SUCCESS) { printf("FAILURE: PMI_KVS_Create: %d, task %d\n", rc, pmi_rank); -- GitLab