From b19e67ff3e910d3b5789be19c9c69a8a723cb907 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 20 Feb 2009 23:17:05 +0000 Subject: [PATCH] Add new srun argument for OpenMPI, --resv-ports --- doc/man/man1/srun.1 | 5 ++++ slurm/slurm.h.in | 45 ++++++++++++++++---------------- src/api/step_ctx.c | 3 ++- src/common/slurm_protocol_defs.h | 1 + src/common/slurm_protocol_pack.c | 2 ++ src/srun/allocate.c | 2 ++ src/srun/opt.c | 23 +++++++++++++++- src/srun/opt.h | 1 + 8 files changed, 58 insertions(+), 24 deletions(-) diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 28a8274e648..ff17de6174c 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -877,6 +877,11 @@ value of \fB\-\-nodes\fR exceeds the number of nodes identified with the \fB\-\-relative\fR option, a warning message will be printed and the \fB\-\-relative\fR option will take precedence. +.TP +\fB\-\-resv\-ports\fR +Reserve communication ports for this job. +Used for OpenMPI. + .TP \fB\-\-reservation\fR=\fIname\fR Allocate resources for the job from the named reservation. 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 3ef2a3454ef..bdfa17699d9 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -765,38 +765,39 @@ typedef struct srun_user_msg { typedef struct srun_node_fail_msg { uint32_t job_id; /* slurm job_id */ - uint32_t step_id; /* step_id or NO_VAL */ char *nodelist; /* name of failed node(s) */ + uint32_t step_id; /* step_id or NO_VAL */ } srun_node_fail_msg_t; typedef struct { + uint16_t ckpt_interval; /* checkpoint interval in minutes */ + char *ckpt_path; /* path to store checkpoint image files */ + uint32_t cpu_count; /* number of required processors */ + uint16_t exclusive; /* 1 if CPUs not shared with other steps */ + uint16_t immediate; /* 1 if allocate to run or fail immediately, + * 0 if to be queued awaiting resources */ uint32_t job_id; /* job ID */ - uid_t uid; + uint32_t mem_per_task; /* memory required per task (MB), + * use job limit if 0 */ char *name; /* name of the job step */ + char *network; /* network use spec */ + uint8_t no_kill; /* 1 if no kill on node failure */ uint32_t node_count; /* number of required nodes */ - uint32_t cpu_count; /* number of required processors */ - uint32_t task_count; /* number of tasks required */ - uint16_t relative; /* first node to use of job's allocation */ - uint16_t task_dist; /* see enum task_dist_state, default - is SLURM_DIST_CYCLIC */ - uint16_t plane_size; /* plane size when task_dist = - SLURM_DIST_PLANE */ char *node_list; /* list of required nodes */ - char *network; /* network use spec */ - uint16_t immediate; /* 1 if allocate to run or fail immediately, - * 0 if to be queued awaiting resources */ - uint16_t exclusive; /* 1 if CPUs not shared with other steps */ bool overcommit; /* "true" to allow the allocation of more tasks - to a node than available processors, - "false" to accept at most one task per - processor. "false" by default. 
*/ - uint8_t no_kill; /* 1 if no kill on node failure */ - uint16_t ckpt_interval; /* checkpoint interval in minutes */ - char *ckpt_path; /* path to store checkpoint image files */ + * to a node than available processors, + * "false" to accept at most one task per + * processor. "false" by default. */ + uint16_t plane_size; /* plane size when task_dist = + * SLURM_DIST_PLANE */ + uint16_t relative; /* first node to use of job's allocation */ + uint8_t resv_ports; /* reserve ports if set */ + uint32_t task_count; /* number of tasks required */ + uint16_t task_dist; /* see enum task_dist_state, default + * is SLURM_DIST_CYCLIC */ + uid_t uid; /* user ID */ uint16_t verbose_level; /* for extra logging decisions in step - launch api */ - uint32_t mem_per_task; /* memory required per task (MB), - * use job limit if 0 */ + * launch api */ } slurm_step_ctx_params_t; typedef struct { diff --git a/src/api/step_ctx.c b/src/api/step_ctx.c index 1c6f71ba395..d0db582e600 100644 --- a/src/api/step_ctx.c +++ b/src/api/step_ctx.c @@ -2,7 +2,7 @@ * step_ctx.c - step_ctx task functions for use by AIX/POE ***************************************************************************** * Copyright (C) 2004-2007 The Regents of the University of California. - * Copyright (C) 2008 Lawrence Livermore National Security. + * Copyright (C) 2008-2009 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov>. * CODE-OCEC-09-009. All rights reserved. 
@@ -90,6 +90,7 @@ static job_step_create_request_msg_t *_create_step_request( step_req->cpu_count = step_params->cpu_count; step_req->num_tasks = step_params->task_count; step_req->relative = step_params->relative; + step_req->resv_ports = step_params->resv_ports; step_req->exclusive = step_params->exclusive; step_req->immediate = step_params->immediate; step_req->ckpt_interval = step_params->ckpt_interval; diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index ecece41a7e2..03ad207dbe9 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -466,6 +466,7 @@ typedef struct job_step_specs { SLURM_DIST_PLANE */ uint16_t port; /* port to contact initiating srun */ uint16_t relative; /* first node to use of job's allocation */ + uint8_t resv_ports; /* reserve ports for MPI if set */ uint16_t task_dist; /* see enum task_dist_state */ uint32_t user_id; /* user the job runs as */ } job_step_create_request_msg_t; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 9623ef69148..044fcf1c2a6 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -1941,6 +1941,7 @@ _pack_job_step_create_request_msg(job_step_create_request_msg_t pack8(msg->no_kill, buffer); pack8(msg->overcommit, buffer); + pack8(msg->resv_ports, buffer); } static int @@ -1978,6 +1979,7 @@ _unpack_job_step_create_request_msg(job_step_create_request_msg_t ** msg, safe_unpack8(&(tmp_ptr->no_kill), buffer); safe_unpack8(&(tmp_ptr->overcommit), buffer); + safe_unpack8(&(tmp_ptr->resv_ports), buffer); return SLURM_SUCCESS; diff --git a/src/srun/allocate.c b/src/srun/allocate.c index d26100b2014..4f564c7e7e5 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -538,6 +538,8 @@ create_job_step(srun_job_t *job, bool use_all_cpus) job->ctx_params.exclusive = (uint16_t)opt.exclusive; job->ctx_params.immediate = (uint16_t)opt.immediate; job->ctx_params.verbose_level = 
(uint16_t)_verbose; + job->ctx_params.resv_ports = (uint8_t) opt.resv_ports; + switch (opt.distribution) { case SLURM_DIST_BLOCK: case SLURM_DIST_ARBITRARY: diff --git a/src/srun/opt.c b/src/srun/opt.c index 10f3e0ca149..280c546d972 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -99,6 +99,7 @@ #define OPT_OVERCOMMIT 0x06 #define OPT_CORE 0x07 #define OPT_CONN_TYPE 0x08 +#define OPT_RESV_PORTS 0x09 #define OPT_NO_ROTATE 0x0a #define OPT_GEOMETRY 0x0b #define OPT_MPI 0x0c @@ -127,7 +128,8 @@ #define LONG_OPT_UID 0x10a #define LONG_OPT_GID 0x10b #define LONG_OPT_MPI 0x10c -#define LONG_OPT_CORE 0x10e +#define LONG_OPT_RESV_PORTS 0x10d +#define LONG_OPT_CORE 0x10e #define LONG_OPT_DEBUG_TS 0x110 #define LONG_OPT_CONNTYPE 0x111 #define LONG_OPT_TEST_ONLY 0x113 @@ -344,6 +346,7 @@ static void _opt_default() opt.relative = NO_VAL; opt.relative_set = false; + opt.resv_ports = 0; opt.cmd_name = NULL; opt.job_name = NULL; opt.job_name_set_cmd = false; @@ -489,6 +492,7 @@ env_vars_t env_vars[] = { {"SLURM_PARTITION", OPT_STRING, &opt.partition, NULL }, {"SLURM_RAMDISK_IMAGE", OPT_STRING, &opt.ramdiskimage, NULL }, {"SLURM_REMOTE_CWD", OPT_STRING, &opt.cwd, NULL }, +{"SLURM_RESV_PORTS", OPT_RESV_PORTS, NULL, NULL }, {"SLURM_STDERRMODE", OPT_STRING, &opt.efname, NULL }, {"SLURM_STDINMODE", OPT_STRING, &opt.ifname, NULL }, {"SLURM_STDOUTMODE", OPT_STRING, &opt.ofname, NULL }, @@ -602,6 +606,13 @@ _process_env_var(env_vars_t *e, const char *val) opt.shared = 0; break; + case OPT_RESV_PORTS: + if (val) + opt.resv_ports = strtol(val, NULL, 10); + else + opt.resv_ports = 1; + break; + case OPT_OPEN_MODE: if ((val[0] == 'a') || (val[0] == 'A')) opt.open_mode = OPEN_MODE_APPEND; @@ -730,6 +741,7 @@ static void set_options(const int argc, char **argv) {"mem-per-cpu", required_argument, 0, LONG_OPT_MEM_PER_CPU}, {"hint", required_argument, 0, LONG_OPT_HINT}, {"mpi", required_argument, 0, LONG_OPT_MPI}, + {"resv-ports", optional_argument, 0, LONG_OPT_RESV_PORTS}, {"tmp", 
required_argument, 0, LONG_OPT_TMP}, {"jobid", required_argument, 0, LONG_OPT_JOBID}, {"msg-timeout", required_argument, 0, LONG_OPT_TIMEO}, @@ -1069,6 +1081,12 @@ static void set_options(const int argc, char **argv) optarg); } break; + case LONG_OPT_RESV_PORTS: + if (optarg) + opt.resv_ports = strtol(optarg, NULL, 10); + else + opt.resv_ports = 1; + break; case LONG_OPT_TMP: opt.job_min_tmp_disk = str_to_bytes(optarg); if (opt.job_min_tmp_disk < 0) { @@ -1928,6 +1946,7 @@ static void _opt_list() info("ntasks-per-socket : %d", opt.ntasks_per_socket); info("ntasks-per-core : %d", opt.ntasks_per_core); info("plane_size : %u", opt.plane_size); + info("resv_ports : %u", opt.resv_ports); str = print_commandline(opt.argc, opt.argv); info("remote command : `%s'", str); xfree(str); @@ -1965,6 +1984,7 @@ static void _usage(void) " [--mail-type=type] [--mail-user=user] [--nice[=value]]\n" " [--prolog=fname] [--epilog=fname] [--multi-prog]\n" " [--task-prolog=fname] [--task-epilog=fname]\n" + " [--resv-ports]\n" " [-w hosts...] [-x hosts...] executable [args...]\n"); } @@ -2057,6 +2077,7 @@ static void _help(void) " --mem-per-cpu=MB maximum amount of real memory per allocated\n" " CPU required by the job.\n" " --mem >= --mem-per-cpu if --mem is specified.\n" +" --resv-ports reserve communication ports\n" "\n" "Affinity/Multi-core options: (when the task/affinity plugin is enabled)\n" " -B --extra-node-info=S[:C[:T]] Expands to:\n" diff --git a/src/srun/opt.h b/src/srun/opt.h index b38483192b6..cd68a770ea2 100644 --- a/src/srun/opt.h +++ b/src/srun/opt.h @@ -115,6 +115,7 @@ typedef struct srun_options { char *ckpt_interval_str;/* --checkpoint (string) */ char *ckpt_path; /* --checkpoint-path (string) */ bool exclusive; /* --exclusive */ + int resv_ports; /* --resv-ports */ char *partition; /* --partition=n, -p n */ enum task_dist_states distribution; /* --distribution=, -m dist */ -- GitLab