From b974697e7a3dc0ec07e210f078a7bd87eb9a76ae Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 26 May 2006 23:30:01 +0000
Subject: [PATCH] Fix bug in PMI support that prevented use of second
 PMI_Barrier call.

---
 NEWS                            |  1 +
 src/api/slurm_pmi.c             | 59 +++++++-------------------
 testsuite/expect/test7.2        |  2 +-
 testsuite/expect/test7.2.prog.c | 75 +++++++++++++++++++++++++++++++--
 4 files changed, 90 insertions(+), 47 deletions(-)

diff --git a/NEWS b/NEWS
index 9ef885adb9a..1377200afda 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,7 @@ documents those changes that are of interest to users and admins.
  -- If a user breaks out of srun before the allocation takes place, mark the 
     job as CANCELLED rather than COMPLETED and change its start and end time 
     to that time.
+ -- Fix bug in PMI support that prevented use of second PMI_Barrier call.
 
 * Changes in SLURM 1.1.0
 ========================
diff --git a/src/api/slurm_pmi.c b/src/api/slurm_pmi.c
index 9b1c5036b6e..751eff28ee1 100644
--- a/src/api/slurm_pmi.c
+++ b/src/api/slurm_pmi.c
@@ -60,9 +60,8 @@ static int _get_addr(void)
 /* Transmit PMI Keyval space data */
 int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr)
 {
-	slurm_msg_t msg_send;//, msg_rcv;
+	slurm_msg_t msg_send;
 	int rc;
-	//List ret_list;
 
 	if (kvs_set_ptr == NULL)
 		return EINVAL;
@@ -78,24 +77,11 @@ int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr)
 	msg_send.forward_struct_init = 0;
 	
 	/* Send the RPC to the local srun communcation manager */
-	slurm_send_recv_rc_msg_only_one(&msg_send, &rc, 0);
-	/* ret_list = (List) slurm_send_recv_node_msg(&msg_send, &msg_rcv, 0); */
-
-/* 	if(!ret_list || errno != SLURM_SUCCESS) { */
-/* 		error("slurm_send_kvs_comm_set: %m"); */
-/* 		return SLURM_ERROR; */
-/* 	} */
-/* 	if(list_count(ret_list)>0) { */
-/* 		error("slurm_send_kvs_comm_set: " */
-/* 		      "got %d from receive, expecting 0", */
-/* 		      list_count(ret_list)); */
-/* 	} */
-/* 	list_destroy(ret_list); */
-	
-/* 	if (msg_rcv.msg_type != RESPONSE_SLURM_RC) */
-/* 		return SLURM_UNEXPECTED_MSG_ERROR; */
-/* 	rc = ((return_code_msg_t *) msg_rcv.data)->return_code; */
-/* 	slurm_free_return_code_msg((return_code_msg_t *) msg_rcv.data); */
+	if (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, 0) < 0) {
+		error("slurm_send_kvs_comm_set: %m");
+		return SLURM_ERROR;
+	}
+
 	return rc;
 }
 
@@ -105,7 +91,7 @@ int  slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr,
 {
 	int rc, srun_fd;
 	slurm_msg_t msg_send, msg_rcv;
-	slurm_addr slurm_addr;
+	slurm_addr slurm_addr, srun_reply_addr;
 	char hostname[64];
 	uint16_t port;
 	kvs_get_msg_t data;
@@ -151,43 +137,29 @@ int  slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr,
 	forward_init(&msg_send.forward, NULL);
 	msg_send.ret_list = NULL;
 	msg_send.forward_struct_init = 0;
-	
+
 	/* Send the RPC to the local srun communcation manager */
-	slurm_send_recv_rc_msg_only_one(&msg_send, &rc, 0);
-	/* ret_list = (List) slurm_send_recv_node_msg(&msg_send, &msg_rcv, 0); */
-
-/* 	if(!ret_list || errno != SLURM_SUCCESS) { */
-/* 		error("slurm_send_recv_node_msg: %m"); */
-/* 		return SLURM_ERROR; */
-/* 	} */
-/* 	if(list_count(ret_list)>0) { */
-/* 		error("slurm_send_recv_node_msg: " */
-/* 		      "got %d from receive, expecting 0", */
-/* 		      list_count(ret_list)); */
-/* 	} */
-/* 	list_destroy(ret_list); */
-	
-/* 	if (msg_rcv.msg_type != RESPONSE_SLURM_RC) { */
-/* 		error("slurm_get_kvs_comm_set msg_type=%d", msg_rcv.msg_type); */
-/* 		return SLURM_UNEXPECTED_MSG_ERROR; */
-/* 	} */
-/* 	rc = ((return_code_msg_t *) msg_rcv.data)->return_code; */
-/* 	slurm_free_return_code_msg((return_code_msg_t *) msg_rcv.data); */
+	if (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, 0) < 0) {
+		error("slurm_get_kvs_comm_set: %m");
+		return SLURM_ERROR;
+	}
 	if (rc != SLURM_SUCCESS) {
 		error("slurm_get_kvs_comm_set error_code=%d", rc);
 		return rc;
 	}
 
 	/* get the message after all tasks reach the barrier */
-	srun_fd = slurm_accept_msg_conn(pmi_fd, &srun_addr);
+	srun_fd = slurm_accept_msg_conn(pmi_fd, &srun_reply_addr);
 	if (srun_fd < 0) {
 		error("slurm_accept_msg_conn: %m");
 		return errno;
 	}
+
 	while ((ret_list = slurm_receive_msg(srun_fd, &msg_rcv, 0)) == NULL) {
 		if (errno == EINTR)
 			continue;
 		error("slurm_receive_msg: %m");
+		slurm_close_accepted_conn(srun_fd);
 		return errno;
 	}
 	if(ret_list) {
@@ -201,6 +173,7 @@ int  slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr,
 	msg_rcv.conn_fd = srun_fd;
 	if (msg_rcv.msg_type != PMI_KVS_GET_RESP) {
 		error("slurm_get_kvs_comm_set msg_type=%d", msg_rcv.msg_type);
+		slurm_close_accepted_conn(srun_fd);
 		return SLURM_UNEXPECTED_MSG_ERROR;
 	}
 	if (slurm_send_rc_msg(&msg_rcv, SLURM_SUCCESS) < 0)
diff --git a/testsuite/expect/test7.2 b/testsuite/expect/test7.2
index b3d3ce20edc..95879756ef8 100755
--- a/testsuite/expect/test7.2
+++ b/testsuite/expect/test7.2
@@ -62,7 +62,7 @@ if { [test_bluegene] } {
 	}
 }
 
-spawn $srun -l -N$node_cnt -n10 -O -t1 $file_prog_get
+spawn $srun -l -N$node_cnt -n6 -O -t1 $file_prog_get
 expect {
 	-re "FAILURE" {
 		send_user "\nFAILURE: some error occured\n"
diff --git a/testsuite/expect/test7.2.prog.c b/testsuite/expect/test7.2.prog.c
index f3a59fe2f7e..eda7cd39439 100644
--- a/testsuite/expect/test7.2.prog.c
+++ b/testsuite/expect/test7.2.prog.c
@@ -1,7 +1,7 @@
 /*****************************************************************************\
  *  test7.2.prog.c - Test of basic PMI library functionality
  *****************************************************************************
- *  Copyright (C) 2005 The Regents of the University of California.
+ *  Copyright (C) 2005-2006 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Morris Jette <jette1@llnl.gov>
  *  UCRL-CODE-217948.
@@ -103,7 +103,8 @@ main (int argc, char **argv)
 	printf("PMI_KVS_Get_name_length_max = %d\n", kvs_name_len);
 	kvs_name = malloc(kvs_name_len);
 	if ((rc = PMI_KVS_Get_my_name(kvs_name, kvs_name_len)) != PMI_SUCCESS) {
-		printf("FAILURE: PMI_KVS_Get_my_name: %d, task %d\n", rc, pmi_rank);
+		printf("FAILURE: PMI_KVS_Get_my_name: %d, task %d\n", rc, 
+			pmi_rank);
 		exit(1);
 	}
 	printf("PMI_KVS_Get_my_name = %s\n", kvs_name);
@@ -150,7 +151,8 @@ main (int argc, char **argv)
 		exit(1);
 	}
 	printf("PMI_Barrier completed\n");
-	/* Tasks 0 and 1  only: Now lets get all keypairs and validate */
+
+	/* Tasks 0 and 1 only: Now let's get all keypairs and validate */
 	if (pmi_rank <= 1) {
 		for (i=0; i<pmi_size; i++) {
 			snprintf(key, key_len, "ATTR_1_%d", i);
@@ -211,6 +213,73 @@ main (int argc, char **argv)
 		}
 	}
 
+	/* Build some more key=val pairs */
+	snprintf(key, key_len, "ATTR_3_%d", procid);
+	snprintf(val, val_len, "C%d", procid+OFFSET_1);
+	if ((rc = PMI_KVS_Put(kvs_name, key, val)) != PMI_SUCCESS) {
+		printf("FAILURE: PMI_KVS_Put(%s,%s,%s): %d, task %d\n",
+			kvs_name, key, val, rc, pmi_rank);
+		exit(1);
+	}
+	printf("PMI_KVS_Put(%s,%s,%s)\n", kvs_name, key, val);
+	snprintf(key, key_len, "attr_4_%d", procid);
+	snprintf(val, val_len, "D%d", procid+OFFSET_2);
+	if ((rc = PMI_KVS_Put(kvs_name, key, val)) != PMI_SUCCESS) {
+		printf("FAILURE: PMI_KVS_Put(%s,%s,%s): %d, task %d\n",
+			kvs_name, key, val, rc, pmi_rank);
+		exit(1);
+	}
+	printf("PMI_KVS_Put(%s,%s,%s)\n", kvs_name, key, val);
+
+	/* Sync KVS across all tasks */
+	if ((rc = PMI_KVS_Commit(kvs_name)) != PMI_SUCCESS) {
+		printf("FAILURE: PMI_KVS_Commit: %d, task %d\n", rc, pmi_rank);
+		exit(1);
+	}
+	printf("PMI_KVS_Commit completed\n");
+
+	if ((rc = PMI_Barrier()) != PMI_SUCCESS) {
+		printf("FAILURE: PMI_Barrier: %d, task %d\n", rc, pmi_rank);
+		exit(1);
+	}
+	printf("PMI_Barrier completed\n");
+
+	/* Tasks 0 and 1 only: Now let's get some keypairs and validate */
+	if (pmi_rank <= 1) {
+		for (i=0; i<pmi_size; i++) {
+			snprintf(key, key_len, "ATTR_1_%d", i);
+			if ((rc = PMI_KVS_Get(kvs_name, key, val, val_len))
+					!= PMI_SUCCESS) {
+				printf("FAILURE: PMI_KVS_Get(%s): %d, task %d\n", 
+				key, rc, pmi_rank);
+				exit(1);
+			}
+			if ((val[0] != 'A')
+			||  ((atoi(&val[1])-OFFSET_1) != i)) {
+				printf("FAILURE: Bad keypair %s=%s, task %d\n",
+					key, val, pmi_rank);
+				exit(1);
+			}
+			printf("PMI_KVS_Get(%s,%s) %s\n", kvs_name, key, val);
+
+			snprintf(key, key_len, "attr_4_%d", i);
+			if ((rc = PMI_KVS_Get(kvs_name, key, val, val_len))
+					!= PMI_SUCCESS) {
+				printf("FAILURE: PMI_KVS_Get(%s): %d, task %d\n", 
+					key, rc, pmi_rank);
+				exit(1);
+			}
+			if ((val[0] != 'D')
+			||  ((atoi(&val[1])-OFFSET_2) != i)) {
+				printf("FAILURE: Bad keypair %s=%s, task %d\n",
+					key,val, pmi_rank);
+				exit(1);
+			}
+			printf("PMI_KVS_Get(%s,%s) %s\n", kvs_name, key, val);
+
+		}
+	}
+
 	/* create new keyspace and test it */
 	if ((rc = PMI_KVS_Create(kvs_name, kvs_name_len)) != PMI_SUCCESS) {
 		printf("FAILURE: PMI_KVS_Create: %d, task %d\n", rc, pmi_rank);
-- 
GitLab