diff --git a/NEWS b/NEWS
index 8c73dd78dfaeb921509d3a3fcab48a062cb76c5a..970d14441a31419c7cd63ce51d9b89e7f0462908 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,9 @@ documents those changes that are of interest to users and administrators.
  -- Fix issue where if no clusters were added but yet a QOS needed to be
    deleted make it possible.
  -- SlurmDBD - change all timestamps to bigint from int to solve Y2038 problem.
+ -- Add salloc/sbatch/srun --spread-job option to distribute tasks over as
+    many nodes as possible. This also treats the --ntasks-per-node option
+    as a maximum value.
 
 * Changes in Slurm 16.05.2
 ==========================
diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1
index 1ee90a10d9493550a0a41ee4e3f9f64ba94ad71f..9037a7f12bfda351b49aa41ab59ea095795e8478 100644
--- a/doc/man/man1/salloc.1
+++ b/doc/man/man1/salloc.1
@@ -1203,6 +1203,11 @@ Restrict node selection to nodes with at least the specified number of
 sockets. See additional information under \fB\-B\fR option above when
 task/affinity plugin is enabled.
 
+.TP
+\fB\-\-spread\-job\fR
+Spread the job allocation over as many nodes as possible and attempt to
+evenly distribute tasks across the allocated nodes.
+
 .TP
 \fB\-\-switches\fR=<\fIcount\fR>[@<\fImax\-time\fR>]
 When a tree topology is used, this defines the maximum count of switches
@@ -1482,6 +1487,9 @@ Same as \fB\-\-reservation\fR
 \fBSALLOC_SIGNAL\fR
 Same as \fB\-\-signal\fR
 .TP
+\fBSALLOC_SPREAD_JOB\fR
+Same as \fB\-\-spread\-job\fR
+.TP
 \fBSALLOC_THREAD_SPEC\fR
 Same as \fB\-\-thread\-spec\fR
 .TP
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index 0f5e0d088c9f4f26de4d2467e60cd577612087ba..97b5bda99239b6d1b6960a1ba33cefb92c1cedd5 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -1387,6 +1387,11 @@ Restrict node selection to nodes with at least the specified number of
 sockets. See additional information under \fB\-B\fR option above when
 task/affinity plugin is enabled.
 
+.TP
+\fB\-\-spread\-job\fR
+Spread the job allocation over as many nodes as possible and attempt to
+evenly distribute tasks across the allocated nodes.
+
 .TP
 \fB\-\-switches\fR=<\fIcount\fR>[@<\fImax\-time\fR>]
 When a tree topology is used, this defines the maximum count of switches
@@ -1738,6 +1743,9 @@ Same as \fB\-\-requeue\fR
 \fBSBATCH_SIGNAL\fR
 Same as \fB\-\-signal\fR
 .TP
+\fBSBATCH_SPREAD_JOB\fR
+Same as \fB\-\-spread\-job\fR
+.TP
 \fBSBATCH_THREAD_SPEC\fR
 Same as \fB\-\-thread\-spec\fR
 .TP
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index d2dd29beba4fe0959fcc7a9e7f959ded260b0d74..a0feb08049d28d98e1b8b93a6532f6a73cdc4b70 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -1811,6 +1811,11 @@ Restrict node selection to nodes with at least the specified number of
 sockets. See additional information under \fB\-B\fR option above when
 task/affinity plugin is enabled. This option applies to job allocations.
 
+.TP
+\fB\-\-spread\-job\fR
+Spread the job allocation over as many nodes as possible and attempt to
+evenly distribute tasks across the allocated nodes.
+
 .TP
 \fB\-\-switches\fR=<\fIcount\fR>[@<\fImax\-time\fR>]
 When a tree topology is used, this defines the maximum count of switches
@@ -2498,6 +2503,9 @@ Same as \fB\-e, \-\-error\fR
 \fBSLURM_STDINMODE\fR
 Same as \fB\-i, \-\-input\fR
 .TP
+\fBSLURM_SPREAD_JOB\fR
+Same as \fB\-\-spread\-job\fR
+.TP
 \fBSLURM_SRUN_REDUCE_TASK_EXIT_MSG\fR
 if set and non-zero, successive task exit messages with the same exit
 code will be printed only once.
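The man page additions above describe the user-visible behavior; under the hood the flag maps onto the new SPREAD_JOB bit in the bitflags field of job_desc_msg_t (see the slurm.h.in hunk below), so the same request can also be made through the C API. A minimal sketch, not part of this patch -- the job name, task count, and script are placeholders, and real code would set whatever other fields a site requires:

    #include <stdio.h>
    #include <unistd.h>
    #include <slurm/slurm.h>

    int main(void)
    {
            job_desc_msg_t desc;
            submit_response_msg_t *resp = NULL;

            slurm_init_job_desc_msg(&desc);
            desc.name      = "spread_test";              /* placeholder */
            desc.num_tasks = 8;                          /* like -n8 */
            desc.min_nodes = 1;
            desc.user_id   = getuid();
            desc.group_id  = getgid();
            desc.script    = "#!/bin/sh\nsrun hostname\n";
            desc.bitflags  = SPREAD_JOB;                 /* like --spread-job */

            if (slurm_submit_batch_job(&desc, &resp) != SLURM_SUCCESS) {
                    slurm_perror("slurm_submit_batch_job");
                    return 1;
            }
            printf("Submitted batch job %u\n", resp->job_id);
            slurm_free_submit_response_response_msg(resp);
            return 0;
    }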
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 8e8f76243d069574d1d726cfb918889709109e9d..262c293ab127a500e39740d4584fe4e1e2389107 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -977,7 +977,7 @@ enum ctx_keys {
 #define PRIORITY_FLAGS_FAIR_TREE  0x0020 /* Prioritize by level in
                                           * account hierarchy. */
 
-/* These bits are set in the flags field of job_desc_msg_t */
+/* These bits are set in the bitflags field of job_desc_msg_t */
 #define KILL_INV_DEP    0x00000001 /* Kill job on invalid dependency */
 #define NO_KILL_INV_DEP 0x00000002 /* Don't kill job on invalid dependency */
 #define HAS_STATE_DIR   0x00000004 /* Used by slurmctld to track state dir */
@@ -986,6 +986,7 @@ enum ctx_keys {
 #define TEST_NOW_ONLY   0x00000020 /* Test for immediately start only */
 #define NODE_MEM_CALC   0x00000040 /* Per-node memory limit calculated */
 #define NODE_REBOOT     0x00000080 /* Waiting for node reboot */
+#define SPREAD_JOB      0x00000100 /* Spread job across max node count */
 
 /*****************************************************************************\
  *      SLURM HOSTLIST FUNCTIONS
diff --git a/src/api/job_info.c b/src/api/job_info.c
index 2007c7e1b359f98758ad7240b880e29103877afd..21f22701552613a3dc65a67a3303ee452443cfdc 100644
--- a/src/api/job_info.c
+++ b/src/api/job_info.c
@@ -955,6 +955,8 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
                         xstrcat(out, "KillOInInvalidDependent=Yes");
                 if (job_ptr->bitflags & NO_KILL_INV_DEP)
                         xstrcat(out, "KillOInInvalidDependent=No");
+                if (job_ptr->bitflags & SPREAD_JOB)
+                        xstrcat(out, "SpreadJob=Yes");
         }
 
 /****** END OF JOB RECORD ******/
diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c
index 3a1333f5add0ab321cec87be91dd22d42569e6cd..d4a1e4da592a995b26c0365229c4a48b2950cf10 100644
--- a/src/plugins/select/cons_res/job_test.c
+++ b/src/plugins/select/cons_res/job_test.c
@@ -202,7 +202,7 @@ static uint16_t _allocate_sc(struct job_record *job_ptr, bitstr_t *core_map,
                              bool entire_sockets_only)
 {
         uint16_t cpu_count = 0, cpu_cnt = 0;
-        uint16_t si, cps, avail_cpus = 0, num_tasks = 0;
+        uint16_t si, cps, avail_cpus = 0, max_cpus, num_tasks = 0;
         uint32_t core_begin    = cr_get_coremap_offset(node_i);
         uint32_t core_end      = cr_get_coremap_offset(node_i+1);
         uint32_t c;
@@ -432,6 +432,16 @@ static uint16_t _allocate_sc(struct job_record *job_ptr, bitstr_t *core_map,
                 if (job_ptr->details->ntasks_per_node)
                         avail_cpus = num_tasks * cpus_per_task;
         }
+
+        if ((job_ptr->bit_flags & SPREAD_JOB) &&
+            (job_ptr->details->ntasks_per_node != 0)) {
+                /* Treat ntasks_per_node as maximum */
+                max_cpus = job_ptr->details->ntasks_per_node;
+                if (cpus_per_task > 1)
+                        max_cpus *= cpus_per_task;
+                avail_cpus = MIN(avail_cpus, max_cpus);
+        }
+
         if ((job_ptr->details->ntasks_per_node &&
              (num_tasks < job_ptr->details->ntasks_per_node) &&
              (job_ptr->details->overcommit == 0)) ||
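To make the new cons_res clamp concrete: with --spread-job, --ntasks-per-node now caps the CPUs taken on each node instead of demanding that count. A minimal standalone sketch of the arithmetic above, with made-up numbers (not Slurm code):

    #include <stdio.h>

    #define MIN(a, b) (((a) < (b)) ? (a) : (b))

    int main(void)
    {
            unsigned short avail_cpus = 16;      /* CPUs the node could offer */
            unsigned short ntasks_per_node = 4;  /* --ntasks-per-node, now a max */
            unsigned short cpus_per_task = 2;    /* --cpus-per-task */
            unsigned short max_cpus = ntasks_per_node;

            if (cpus_per_task > 1)
                    max_cpus *= cpus_per_task;
            avail_cpus = MIN(avail_cpus, max_cpus);
            printf("avail_cpus capped at %hu\n", avail_cpus); /* prints 8 */
            return 0;
    }

So a node that could contribute 16 CPUs' worth of work contributes at most 8 here, pushing the remaining tasks onto additional nodes.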
{"SALLOC_PARTITION", OPT_STRING, &opt.partition, NULL }, {"SALLOC_POWER", OPT_POWER, NULL, NULL }, {"SALLOC_PROFILE", OPT_PROFILE, NULL, NULL }, + {"SALLOC_REQ_SWITCH", OPT_INT, &opt.req_switch, NULL }, {"SALLOC_QOS", OPT_STRING, &opt.qos, NULL }, {"SALLOC_RESERVATION", OPT_STRING, &opt.reservation, NULL }, {"SALLOC_SIGNAL", OPT_SIGNAL, NULL, NULL }, + {"SALLOC_SPREAD_JOB", OPT_SPREAD_JOB, NULL, NULL }, {"SALLOC_THREAD_SPEC", OPT_THREAD_SPEC,NULL, NULL }, {"SALLOC_TIMELIMIT", OPT_STRING, &opt.time_limit_str,NULL }, {"SALLOC_WAIT", OPT_IMMEDIATE, NULL, NULL }, {"SALLOC_WAIT_ALL_NODES",OPT_INT, &opt.wait_all_nodes,NULL }, - {"SALLOC_WCKEY", OPT_STRING, &opt.wckey, NULL }, - {"SALLOC_REQ_SWITCH", OPT_INT, &opt.req_switch, NULL }, {"SALLOC_WAIT4SWITCH", OPT_TIME_VAL, NULL, NULL }, + {"SALLOC_WCKEY", OPT_STRING, &opt.wckey, NULL }, {NULL, 0, NULL, NULL} }; @@ -622,6 +625,9 @@ _process_env_var(env_vars_t *e, const char *val) opt.core_spec = parse_int("thread_spec", val, true) | CORE_SPEC_THREAD; break; + case OPT_SPREAD_JOB: + opt.job_flags |= SPREAD_JOB; + break; default: /* do nothing */ break; @@ -715,6 +721,7 @@ void set_options(const int argc, char **argv) {"reservation", required_argument, 0, LONG_OPT_RESERVATION}, {"signal", required_argument, 0, LONG_OPT_SIGNAL}, {"sockets-per-node", required_argument, 0, LONG_OPT_SOCKETSPERNODE}, + {"spread-job", no_argument, 0, LONG_OPT_SPREAD_JOB}, {"switches", required_argument, 0, LONG_OPT_REQ_SWITCH}, {"tasks-per-node", required_argument, 0, LONG_OPT_NTASKSPERNODE}, {"thread-spec", required_argument, 0, LONG_OPT_THREAD_SPEC}, @@ -739,7 +746,7 @@ void set_options(const int argc, char **argv) opt.progname = xbasename(argv[0]); optind = 0; - while((opt_char = getopt_long(argc, argv, opt_string, + while ((opt_char = getopt_long(argc, argv, opt_string, optz, &option_index)) != -1) { switch (opt_char) { @@ -1276,6 +1283,9 @@ void set_options(const int argc, char **argv) opt.core_spec = parse_int("thread_spec", optarg, true) | CORE_SPEC_THREAD; break; + case LONG_OPT_SPREAD_JOB: + opt.job_flags |= SPREAD_JOB; + break; default: if (spank_process_option(opt_char, optarg) < 0) { error("Unrecognized command line parameter %c", @@ -2006,7 +2016,7 @@ static void _usage(void) #endif #endif " [--mail-type=type] [--mail-user=user] [--nice[=value]]\n" -" [--bell] [--no-bell] [--kill-command[=signal]]\n" +" [--bell] [--no-bell] [--kill-command[=signal]] [--spread-job]\n" " [--nodefile=file] [--nodelist=hosts] [--exclude=hosts]\n" " [--network=type] [--mem-per-cpu=MB] [--qos=qos]\n" " [--mem_bind=...] 
diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c
index 6eeabc359d543d65213981fcd7f7625a9a8d1863..267d3dc2b79e6b8916f6d007bf5c70056689b1e7 100644
--- a/src/sbatch/opt.c
+++ b/src/sbatch/opt.c
@@ -111,6 +111,7 @@ enum wrappers {
 #define OPT_CORE_SPEC   0x1a
 #define OPT_CPU_FREQ    0x1b
 #define OPT_POWER       0x1d
+#define OPT_SPREAD_JOB  0x1e
 #define OPT_ARRAY_INX   0x20
 #define OPT_PROFILE     0x21
 #define OPT_HINT        0x22
@@ -180,6 +181,7 @@ enum wrappers {
 #define LONG_OPT_GRES_FLAGS      0x15a
 #define LONG_OPT_PRIORITY        0x160
 #define LONG_OPT_KILL_INV_DEP    0x161
+#define LONG_OPT_SPREAD_JOB      0x162
 #define LONG_OPT_MCS_LABEL       0x165
 #define LONG_OPT_DEADLINE        0x166
 
@@ -473,6 +475,7 @@ env_vars_t env_vars[] = {
   {"SBATCH_REQUEUE",       OPT_REQUEUE,    NULL,               NULL          },
   {"SBATCH_RESERVATION",   OPT_STRING,     &opt.reservation,   NULL          },
   {"SBATCH_SIGNAL",        OPT_SIGNAL,     NULL,               NULL          },
+  {"SBATCH_SPREAD_JOB",    OPT_SPREAD_JOB, NULL,               NULL          },
   {"SBATCH_THREAD_SPEC",   OPT_THREAD_SPEC,NULL,               NULL          },
   {"SBATCH_TIMELIMIT",     OPT_STRING,     &opt.time_limit_str,NULL          },
   {"SBATCH_WAIT",          OPT_BOOL,       &opt.wait,          NULL          },
@@ -661,6 +664,9 @@ _process_env_var(env_vars_t *e, const char *val)
                         exit(error_exit);
                 }
                 break;
+        case OPT_SPREAD_JOB:
+                opt.job_flags |= SPREAD_JOB;
+                break;
         case OPT_GET_USER_ENV:
                 if (val)
                         _proc_get_user_env((char *)val);
@@ -792,6 +798,7 @@ static struct option long_options[] = {
         {"reservation",   required_argument, 0, LONG_OPT_RESERVATION},
         {"signal",        required_argument, 0, LONG_OPT_SIGNAL},
         {"sockets-per-node", required_argument, 0, LONG_OPT_SOCKETSPERNODE},
+        {"spread-job",    no_argument,       0, LONG_OPT_SPREAD_JOB},
         {"switches",      required_argument, 0, LONG_OPT_REQ_SWITCH},
         {"tasks-per-node",required_argument, 0, LONG_OPT_NTASKSPERNODE},
         {"test-only",     no_argument,       0, LONG_OPT_TEST_ONLY},
@@ -1858,6 +1865,9 @@ static void _set_options(int argc, char **argv)
                         if (xstrcasecmp(optarg, "no") == 0)
                                 opt.job_flags |= NO_KILL_INV_DEP;
                         break;
+                case LONG_OPT_SPREAD_JOB:
+                        opt.job_flags |= SPREAD_JOB;
+                        break;
                 default:
                         if (spank_process_option (opt_char, optarg) < 0) {
                                 error("Unrecognized command line parameter %c",
@@ -3214,7 +3224,7 @@ static void _usage(void)
 "            [--cpu-freq=min[-max[:gov]] [--power=flags] [--gres-flags=opts]\n"
 "            [--switches=max-switches{@max-time-to-wait}] [--reboot]\n"
 "            [--core-spec=cores] [--thread-spec=threads] [--bb=burst_buffer_spec]\n"
-"            [--array=index_values] [--profile=...] [--ignore-pbs]\n"
+"            [--array=index_values] [--profile=...] [--ignore-pbs] [--spread-job]\n"
 "            [--export[=names]] [--export-file=file|fd] executable [args...]\n");
 }
 
@@ -3287,6 +3297,7 @@ static void _help(void)
 "  -s, --oversubscribe     over subscribe resources with other jobs\n"
 "  -S, --core-spec=cores   count of reserved cores\n"
 "      --signal=[B:]num[@time] send signal when time limit within time seconds\n"
+"      --spread-job        spread job across as many nodes as possible\n"
 "      --switches=max-switches{@max-time-to-wait}\n"
 "                          Optimum switches and max time to wait for optimum\n"
 "      --thread-spec=threads   count of reserved threads\n"
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index c0ac413181c10bc12d91b199aa963828dc19c2cc..f0f0e4a068de5d38f94dbc0e4f297f164fbd22a2 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -6961,6 +6961,14 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
          */
         detail_ptr->mc_ptr = _set_multi_core_data(job_desc);
 
+        if ((job_ptr->bit_flags & SPREAD_JOB) && (detail_ptr->max_nodes == 0) &&
+            (detail_ptr->num_tasks != 0)) {
+                if (detail_ptr->min_nodes == 0)
+                        detail_ptr->min_nodes = 1;
+                detail_ptr->max_nodes =
+                        MIN(node_record_count, detail_ptr->num_tasks);
+        }
+
         return SLURM_SUCCESS;
 }
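The slurmctld change above is what actually widens the allocation: when the user gave no explicit node count, max_nodes is raised to one node per task (bounded by the cluster size), so the selector is free to spread rather than pack. A worked sketch of that derivation, illustrative numbers only (node_record_count plays the role of the slurmctld global of the same name):

    #include <stdio.h>

    #define MIN(a, b) (((a) < (b)) ? (a) : (b))

    int main(void)
    {
            int spread_job = 1;                 /* SPREAD_JOB bit set */
            unsigned node_record_count = 100;   /* nodes in the cluster */
            unsigned num_tasks = 8;             /* e.g. srun -n8 --spread-job */
            unsigned min_nodes = 0, max_nodes = 0;

            if (spread_job && (max_nodes == 0) && (num_tasks != 0)) {
                    if (min_nodes == 0)
                            min_nodes = 1;
                    max_nodes = MIN(node_record_count, num_tasks);
            }
            /* prints "node range: 1-8" */
            printf("node range: %u-%u\n", min_nodes, max_nodes);
            return 0;
    }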
diff --git a/src/srun/libsrun/opt.c b/src/srun/libsrun/opt.c
index 61ac0f49578fee81e42c1bf384316d95b2d64353..867c5feeeb14ebc90a458b4b7e1836b2994b707a 100644
--- a/src/srun/libsrun/opt.c
+++ b/src/srun/libsrun/opt.c
@@ -116,6 +116,7 @@
 #define OPT_PROFILE     0x20
 #define OPT_EXPORT      0x21
 #define OPT_HINT        0x22
+#define OPT_SPREAD_JOB  0x23
 
 /* generic getopt_long flags, integers and *not* valid characters */
 #define LONG_OPT_HELP        0x100
@@ -192,6 +193,7 @@
 #define LONG_OPT_LAUNCH_CMD      0x156
 #define LONG_OPT_PROFILE         0x157
 #define LONG_OPT_EXPORT          0x158
+#define LONG_OPT_SPREAD_JOB      0x159
 #define LONG_OPT_PRIORITY        0x160
 #define LONG_OPT_ACCEL_BIND      0x161
 #define LONG_OPT_MCS_LABEL       0x165
@@ -621,6 +623,7 @@ env_vars_t env_vars[] = {
 {"SLURM_RESERVATION",   OPT_STRING,     &opt.reservation,   NULL             },
 {"SLURM_RESTART_DIR",   OPT_STRING,     &opt.restart_dir ,  NULL             },
 {"SLURM_RESV_PORTS",    OPT_RESV_PORTS, NULL,               NULL             },
 {"SLURM_SIGNAL",        OPT_SIGNAL,     NULL,               NULL             },
+{"SLURM_SPREAD_JOB",    OPT_SPREAD_JOB, NULL,               NULL             },
 {"SLURM_SRUN_MULTI",    OPT_MULTI,      NULL,               NULL             },
 {"SLURM_STDERRMODE",    OPT_STRING,     &opt.efname,        NULL             },
@@ -859,6 +862,9 @@ _process_env_var(env_vars_t *e, const char *val)
                 opt.core_spec = _get_int(val, "thread_spec", true) |
                         CORE_SPEC_THREAD;
                 break;
+        case OPT_SPREAD_JOB:
+                opt.job_flags |= SPREAD_JOB;
+                break;
         default:
                 /* do nothing */
                 break;
@@ -994,6 +1000,7 @@ static void _set_options(const int argc, char **argv)
                 {"signal",        required_argument, 0, LONG_OPT_SIGNAL},
                 {"slurmd-debug",  required_argument, 0, LONG_OPT_DEBUG_SLURMD},
                 {"sockets-per-node", required_argument, 0, LONG_OPT_SOCKETSPERNODE},
+                {"spread-job",    no_argument,       0, LONG_OPT_SPREAD_JOB},
                 {"switches",      required_argument, 0, LONG_OPT_REQ_SWITCH},
                 {"task-epilog",   required_argument, 0, LONG_OPT_TASK_EPILOG},
                 {"task-prolog",   required_argument, 0, LONG_OPT_TASK_PROLOG},
@@ -1752,6 +1759,9 @@ static void _set_options(const int argc, char **argv)
                 case LONG_OPT_COMPRESS:
                         opt.compress = parse_compress_type(optarg);
                         break;
+                case LONG_OPT_SPREAD_JOB:
+                        opt.job_flags |= SPREAD_JOB;
+                        break;
                 default:
                         if (spank_process_option (opt_char, optarg) < 0) {
                                 exit(error_exit);
@@ -2707,7 +2717,7 @@ static void _usage(void)
 "            [--prolog=fname] [--epilog=fname]\n"
 "            [--task-prolog=fname] [--task-epilog=fname]\n"
 "            [--ctrl-comm-ifhn=addr] [--multi-prog] [--mcs-label=mcs]\n"
-"            [--cpu-freq=min[-max[:gov]] [--power=flags]\n"
+"            [--cpu-freq=min[-max[:gov]] [--power=flags] [--spread-job]\n"
 "            [--switches=max-switches{@max-time-to-wait}] [--reboot]\n"
 "            [--core-spec=cores] [--thread-spec=threads]\n"
 "            [--bb=burst_buffer_spec] [--bbf=burst_buffer_file]\n"
@@ -2804,6 +2814,7 @@ static void _help(void)
 "  -S, --core-spec=cores   count of reserved cores\n"
 "      --signal=[B:]num[@time] send signal when time limit within time seconds\n"
 "      --slurmd-debug=level slurmd debug level\n"
+"      --spread-job        spread job across as many nodes as possible\n"
 "      --switches=max-switches{@max-time-to-wait}\n"
 "                          Optimum switches and max time to wait for optimum\n"
 "      --task-epilog=program   run \"program\" after launching task\n"
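Once the whole chain is in place, the job_info.c change above means a spread job's record shows SpreadJob=Yes in detailed output (e.g. scontrol show job, which formats records via slurm_sprint_job_info()). The bit can also be read back through the public job info API; a small sketch, with a placeholder job ID:

    #include <stdio.h>
    #include <slurm/slurm.h>

    int main(void)
    {
            job_info_msg_t *msg = NULL;
            uint32_t job_id = 12345;        /* placeholder */

            if (slurm_load_job(&msg, job_id, SHOW_ALL) != SLURM_SUCCESS) {
                    slurm_perror("slurm_load_job");
                    return 1;
            }
            if (msg->record_count &&
                (msg->job_array[0].bitflags & SPREAD_JOB))
                    printf("job %u: SpreadJob=Yes\n", job_id);
            slurm_free_job_info_msg(msg);
            return 0;
    }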