From b19e67ff3e910d3b5789be19c9c69a8a723cb907 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 20 Feb 2009 23:17:05 +0000
Subject: [PATCH] Add new srun argument for OpenMPI, --resv-ports

---
 doc/man/man1/srun.1              |  5 ++++
 slurm/slurm.h.in                 | 45 ++++++++++++++++----------------
 src/api/step_ctx.c               |  3 ++-
 src/common/slurm_protocol_defs.h |  1 +
 src/common/slurm_protocol_pack.c |  2 ++
 src/srun/allocate.c              |  2 ++
 src/srun/opt.c                   | 23 +++++++++++++++-
 src/srun/opt.h                   |  1 +
 8 files changed, 58 insertions(+), 24 deletions(-)

diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 28a8274e648..ff17de6174c 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -877,6 +877,11 @@ value of \fB\-\-nodes\fR exceeds the number of nodes identified
 with the \fB\-\-relative\fR option, a warning message will be 
 printed and the \fB\-\-relative\fR option will take precedence.
 
+.TP
+\fB\-\-resv\-ports\fR
+Reserve communication ports for this job.
+Used for OpenMPI.
+
 .TP
 \fB\-\-reservation\fR=\fIname\fR
 Allocate resources for the job from the named reservation.
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 3ef2a3454ef..bdfa17699d9 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -765,38 +765,39 @@ typedef struct srun_user_msg {
 
 typedef struct srun_node_fail_msg {
 	uint32_t job_id;	/* slurm job_id */
-	uint32_t step_id;	/* step_id or NO_VAL */
 	char *nodelist;		/* name of failed node(s) */
+	uint32_t step_id;	/* step_id or NO_VAL */
 } srun_node_fail_msg_t;
 
 typedef struct {
+	uint16_t ckpt_interval;	/* checkpoint interval in minutes */
+	char *ckpt_path;	/* path to store checkpoint image files */
+	uint32_t cpu_count;	/* number of required processors */
+	uint16_t exclusive;	/* 1 if CPUs not shared with other steps */
+	uint16_t immediate;	/* 1 if allocate to run or fail immediately, 
+				 * 0 if to be queued awaiting resources */
 	uint32_t job_id;	/* job ID */
-	uid_t uid;
+	uint32_t mem_per_task;	/* memory required per task (MB), 
+				 * use job limit if 0 */
 	char *name;		/* name of the job step */
+	char *network;		/* network use spec */
+	uint8_t no_kill;	/* 1 if no kill on node failure */
 	uint32_t node_count;	/* number of required nodes */
-	uint32_t cpu_count;	/* number of required processors */
-	uint32_t task_count;	/* number of tasks required */
-	uint16_t relative;	/* first node to use of job's allocation */
-	uint16_t task_dist;	/* see enum task_dist_state, default
-				   is SLURM_DIST_CYCLIC */
-	uint16_t plane_size;	/* plane size when task_dist =
-				   SLURM_DIST_PLANE */
 	char *node_list;	/* list of required nodes */
-	char *network;		/* network use spec */
-	uint16_t immediate;	/* 1 if allocate to run or fail immediately, 
-				 * 0 if to be queued awaiting resources */
-	uint16_t exclusive;	/* 1 if CPUs not shared with other steps */
 	bool overcommit;	/* "true" to allow the allocation of more tasks
-				   to a node than available processors,
-				   "false" to accept at most one task per
-				   processor. "false" by default. */
-	uint8_t no_kill;	/* 1 if no kill on node failure */
-	uint16_t ckpt_interval;	/* checkpoint interval in minutes */
-	char *ckpt_path;	/* path to store checkpoint image files */
+				 * to a node than available processors,
+				 * "false" to accept at most one task per
+				 * processor. "false" by default. */
+	uint16_t plane_size;	/* plane size when task_dist =
+				 * SLURM_DIST_PLANE */
+	uint16_t relative;	/* first node to use of job's allocation */
+	uint8_t  resv_ports;	/* reserve ports if set */
+	uint32_t task_count;	/* number of tasks required */
+	uint16_t task_dist;	/* see enum task_dist_state, default
+				 * is SLURM_DIST_CYCLIC */
+	uid_t uid;		/* user ID */
 	uint16_t verbose_level; /* for extra logging decisions in step
-				   launch api */
-	uint32_t mem_per_task;	/* memory required per task (MB), 
-				 * use job limit if 0 */
+				 * launch api */
 } slurm_step_ctx_params_t;
 
 typedef struct {
diff --git a/src/api/step_ctx.c b/src/api/step_ctx.c
index 1c6f71ba395..d0db582e600 100644
--- a/src/api/step_ctx.c
+++ b/src/api/step_ctx.c
@@ -2,7 +2,7 @@
  *  step_ctx.c - step_ctx task functions for use by AIX/POE
  *****************************************************************************
  *  Copyright (C) 2004-2007 The Regents of the University of California.
- *  Copyright (C) 2008 Lawrence Livermore National Security.
+ *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Morris Jette <jette1@llnl.gov>.
  *  CODE-OCEC-09-009. All rights reserved.
@@ -90,6 +90,7 @@ static job_step_create_request_msg_t *_create_step_request(
 	step_req->cpu_count = step_params->cpu_count;
 	step_req->num_tasks = step_params->task_count;
 	step_req->relative = step_params->relative;
+	step_req->resv_ports = step_params->resv_ports;
 	step_req->exclusive  = step_params->exclusive;
 	step_req->immediate  = step_params->immediate;
 	step_req->ckpt_interval = step_params->ckpt_interval;
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index ecece41a7e2..03ad207dbe9 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -466,6 +466,7 @@ typedef struct job_step_specs {
 				   SLURM_DIST_PLANE */
 	uint16_t port;		/* port to contact initiating srun */
 	uint16_t relative;	/* first node to use of job's allocation */
+	uint8_t resv_ports;	/* reserve ports for MPI if set */
 	uint16_t task_dist;	/* see enum task_dist_state */
 	uint32_t user_id;	/* user the job runs as */
 } job_step_create_request_msg_t;
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 9623ef69148..044fcf1c2a6 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -1941,6 +1941,7 @@ _pack_job_step_create_request_msg(job_step_create_request_msg_t
 
 	pack8(msg->no_kill, buffer);
 	pack8(msg->overcommit, buffer);
+	pack8(msg->resv_ports, buffer);
 }
 
 static int
@@ -1978,6 +1979,7 @@ _unpack_job_step_create_request_msg(job_step_create_request_msg_t ** msg,
 
 	safe_unpack8(&(tmp_ptr->no_kill), buffer);
 	safe_unpack8(&(tmp_ptr->overcommit), buffer);
+	safe_unpack8(&(tmp_ptr->resv_ports), buffer);
 
 	return SLURM_SUCCESS;
 
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index d26100b2014..4f564c7e7e5 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -538,6 +538,8 @@ create_job_step(srun_job_t *job, bool use_all_cpus)
 	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
 	job->ctx_params.immediate = (uint16_t)opt.immediate;
 	job->ctx_params.verbose_level = (uint16_t)_verbose;
+	job->ctx_params.resv_ports = (uint8_t) opt.resv_ports;
+
 	switch (opt.distribution) {
 	case SLURM_DIST_BLOCK:
 	case SLURM_DIST_ARBITRARY:
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 10f3e0ca149..280c546d972 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -99,6 +99,7 @@
 #define OPT_OVERCOMMIT  0x06
 #define OPT_CORE        0x07
 #define OPT_CONN_TYPE	0x08
+#define OPT_RESV_PORTS	0x09
 #define OPT_NO_ROTATE	0x0a
 #define OPT_GEOMETRY	0x0b
 #define OPT_MPI         0x0c
@@ -127,7 +128,8 @@
 #define LONG_OPT_UID         0x10a
 #define LONG_OPT_GID         0x10b
 #define LONG_OPT_MPI         0x10c
-#define LONG_OPT_CORE	     0x10e
+#define LONG_OPT_RESV_PORTS  0x10d
+#define LONG_OPT_CORE        0x10e
 #define LONG_OPT_DEBUG_TS    0x110
 #define LONG_OPT_CONNTYPE    0x111
 #define LONG_OPT_TEST_ONLY   0x113
@@ -344,6 +346,7 @@ static void _opt_default()
 
 	opt.relative = NO_VAL;
 	opt.relative_set = false;
+	opt.resv_ports = 0;
 	opt.cmd_name = NULL;
 	opt.job_name = NULL;
 	opt.job_name_set_cmd = false;
@@ -489,6 +492,7 @@ env_vars_t env_vars[] = {
 {"SLURM_PARTITION",     OPT_STRING,     &opt.partition,     NULL             },
 {"SLURM_RAMDISK_IMAGE", OPT_STRING,     &opt.ramdiskimage,  NULL             },
 {"SLURM_REMOTE_CWD",    OPT_STRING,     &opt.cwd,           NULL             },
+{"SLURM_RESV_PORTS",    OPT_RESV_PORTS, NULL,               NULL             },
 {"SLURM_STDERRMODE",    OPT_STRING,     &opt.efname,        NULL             },
 {"SLURM_STDINMODE",     OPT_STRING,     &opt.ifname,        NULL             },
 {"SLURM_STDOUTMODE",    OPT_STRING,     &opt.ofname,        NULL             },
@@ -602,6 +606,13 @@ _process_env_var(env_vars_t *e, const char *val)
 		opt.shared = 0;
 		break;
 
+	case OPT_RESV_PORTS:
+		if (val)
+			opt.resv_ports = strtol(val, NULL, 10);
+		else
+			opt.resv_ports = 1;
+		break;
+
 	case OPT_OPEN_MODE:
 		if ((val[0] == 'a') || (val[0] == 'A'))
 			opt.open_mode = OPEN_MODE_APPEND;
@@ -730,6 +741,7 @@ static void set_options(const int argc, char **argv)
 		{"mem-per-cpu",      required_argument, 0, LONG_OPT_MEM_PER_CPU},
 		{"hint",             required_argument, 0, LONG_OPT_HINT},
 		{"mpi",              required_argument, 0, LONG_OPT_MPI},
+		{"resv-ports",       optional_argument, 0, LONG_OPT_RESV_PORTS},
 		{"tmp",              required_argument, 0, LONG_OPT_TMP},
 		{"jobid",            required_argument, 0, LONG_OPT_JOBID},
 		{"msg-timeout",      required_argument, 0, LONG_OPT_TIMEO},
@@ -1069,6 +1081,12 @@ static void set_options(const int argc, char **argv)
 				      optarg);
 			}
 			break;
+		case LONG_OPT_RESV_PORTS:
+			if (optarg)
+				opt.resv_ports = strtol(optarg, NULL, 10);
+			else
+				opt.resv_ports = 1;
+			break;
 		case LONG_OPT_TMP:
 			opt.job_min_tmp_disk = str_to_bytes(optarg);
 			if (opt.job_min_tmp_disk < 0) {
@@ -1928,6 +1946,7 @@ static void _opt_list()
 	info("ntasks-per-socket : %d", opt.ntasks_per_socket);
 	info("ntasks-per-core   : %d", opt.ntasks_per_core);
 	info("plane_size        : %u", opt.plane_size);
+	info("resv_ports        : %u", opt.resv_ports);
 	str = print_commandline(opt.argc, opt.argv);
 	info("remote command    : `%s'", str);
 	xfree(str);
@@ -1965,6 +1984,7 @@ static void _usage(void)
 		"            [--mail-type=type] [--mail-user=user] [--nice[=value]]\n"
 		"            [--prolog=fname] [--epilog=fname] [--multi-prog]\n"
 		"            [--task-prolog=fname] [--task-epilog=fname]\n"
+		"            [--resv-ports]\n"
 		"            [-w hosts...] [-x hosts...] executable [args...]\n");
 }
 
@@ -2057,6 +2077,7 @@ static void _help(void)
 "      --mem-per-cpu=MB        maximum amount of real memory per allocated\n"
 "                              CPU required by the job.\n" 
 "                              --mem >= --mem-per-cpu if --mem is specified.\n" 
+"      --resv-ports            reserve communication ports\n" 
 "\n"
 "Affinity/Multi-core options: (when the task/affinity plugin is enabled)\n" 
 "  -B --extra-node-info=S[:C[:T]]            Expands to:\n"
diff --git a/src/srun/opt.h b/src/srun/opt.h
index b38483192b6..cd68a770ea2 100644
--- a/src/srun/opt.h
+++ b/src/srun/opt.h
@@ -115,6 +115,7 @@ typedef struct srun_options {
 	char *ckpt_interval_str;/* --checkpoint (string)	*/
 	char *ckpt_path;	/* --checkpoint-path (string)   */
 	bool exclusive;		/* --exclusive			*/
+	int  resv_ports;	/* --resv-ports			*/
 	char *partition;	/* --partition=n,   -p n   	*/
 	enum task_dist_states
 	        distribution;	/* --distribution=, -m dist	*/
-- 
GitLab