diff --git a/NEWS b/NEWS index a5b72cdba7297d598ab034d4917776a263446449..93a49b0b175c5e3e64ca21692898417d8a1ddfb9 100644 --- a/NEWS +++ b/NEWS @@ -123,7 +123,9 @@ documents those changes that are of interest to users and admins. -- Honor ntasks-per-node option with exclusive node allocations. -- sched/backfill - Prevent invalid memory reference if bf_continue option is configured and slurm is reconfigured during one of the sleep cycles or if - there are any changes to the partition configuration. + there are any changes to the partition configuration or if the normal + scheduler runs and starts a job that the backfill scheduler is actively + working on. -- Update man pages information about acct-freq and JobAcctGatherFrequency to reflect only the latest supported format. -- Minor document update to include note about PrivateData=Usage for the @@ -140,6 +142,9 @@ documents those changes that are of interest to users and admins. -- init scripts ignore quotes around Pid file name specifications. -- Fixed typo about command case in quickstart.html. -- task/cgroup - handle new cpuset files, similar to commit c4223940. + -- Replace the tempnam() function call with mkstemp(). + -- Fix for --cpu_bind=map_cpu/mask_cpu/map_ldom/mask_ldom plus + --mem_bind=map_mem/mask_mem options, broken in 2.6.2. * Changes in Slurm 2.6.3 ======================== diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 4d0a39d1d61aef2c0d52629cc65ee72df7f8c806..7090850e370f7d353caf3dbbf34f55f1a74dc81f 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1287,6 +1287,8 @@ Supported values are "YES" and "NO". The default value is "NO". \fBPriorityFlags\fR Flags to modify priority behavior Applicable only if PriorityType=priority/multifactor. +The keywords below have no associated value +(e.g. "PriorityFlags=ACCRUE_ALWAYS,SMALL_RELATIVE_TO_TIME"). .RS .TP 17 \fBACCRUE_ALWAYS\fR @@ -1784,8 +1786,14 @@ Multiple options may be comma separated. 
The default number of jobs to attempt scheduling (i.e. the queue depth) when a running job completes or other routine actions occur. The full queue will be tested on a less frequent basis. The default value is 100. -In the case of large clusters, configuring a relatively +In the case of large clusters (more than 1000 nodes), configuring a relatively small value may be desirable. +Specifying a large value (say 1000 or higher) can be expected to result in +poor system responsiveness since this scheduling logic will not release +locks for other events to occur. +It would be better to let the backfill scheduler process a larger number of jobs +(see \fBmax_job_bf\fR, \fBbf_continue\fR and other options here for more +information). .TP \fBdefer\fR Setting this option will avoid attempting to schedule each job @@ -1840,6 +1848,11 @@ This option applies only to \fBSchedulerType=sched/backfill\fR. The number of minutes into the future to look when considering jobs to schedule. Higher values result in more overhead and less responsiveness. The default value is 1440 minutes (one day). +A value at least as long as the highest allowed time limit is generally +advisable to prevent job starvation. +In order to limit the amount of data managed by the backfill scheduler, +if the value of \fBbf_window\fR is increased, then it is generally advisable +to also increase \fBbf_resolution\fR. This option applies only to \fBSchedulerType=sched/backfill\fR. 
.TP \fBmax_job_bf=#\fR diff --git a/src/common/slurm_resource_info.c b/src/common/slurm_resource_info.c index 17c7641d06187c0ac2fbfb7e3be5f35d943021aa..c7feeaf01ad72acf172f839fa247c53efbadfb7b 100644 --- a/src/common/slurm_resource_info.c +++ b/src/common/slurm_resource_info.c @@ -305,6 +305,7 @@ int slurm_verify_cpu_bind(const char *arg, char **cpu_bind, (strncasecmp(tok, "mapcpu", 6) == 0)) { char *list; list = strsep(&tok, ":="); + list = strsep(&tok, ":="); /* THIS IS NOT REDUNDANT */ _clear_then_set((int *)flags, bind_bits, CPU_BIND_MAP); xfree(*cpu_bind); if (list && *list) { @@ -319,6 +320,7 @@ int slurm_verify_cpu_bind(const char *arg, char **cpu_bind, (strncasecmp(tok, "maskcpu", 7) == 0)) { char *list; list = strsep(&tok, ":="); + list = strsep(&tok, ":="); /* THIS IS NOT REDUNDANT */ _clear_then_set((int *)flags, bind_bits, CPU_BIND_MASK); xfree(*cpu_bind); if (list && *list) { @@ -337,6 +339,7 @@ int slurm_verify_cpu_bind(const char *arg, char **cpu_bind, (strncasecmp(tok, "mapldom", 7) == 0)) { char *list; list = strsep(&tok, ":="); + list = strsep(&tok, ":="); /* THIS IS NOT REDUNDANT */ _clear_then_set((int *)flags, bind_bits, CPU_BIND_LDMAP); xfree(*cpu_bind); @@ -352,6 +355,7 @@ int slurm_verify_cpu_bind(const char *arg, char **cpu_bind, (strncasecmp(tok, "maskldom", 8) == 0)) { char *list; list = strsep(&tok, ":="); + list = strsep(&tok, ":="); /* THIS IS NOT REDUNDANT */ _clear_then_set((int *)flags, bind_bits, CPU_BIND_LDMASK); xfree(*cpu_bind); @@ -498,6 +502,7 @@ int slurm_verify_mem_bind(const char *arg, char **mem_bind, (strncasecmp(tok, "mapmem", 6) == 0)) { char *list; list = strsep(&tok, ":="); + list = strsep(&tok, ":="); /* THIS IS NOT REDUNDANT */ _clear_then_set((int *)flags, bind_bits, MEM_BIND_MAP); xfree(*mem_bind); if (list && *list) { @@ -511,6 +516,7 @@ int slurm_verify_mem_bind(const char *arg, char **mem_bind, (strncasecmp(tok, "maskmem", 7) == 0)) { char *list; list = strsep(&tok, ":="); + list = strsep(&tok, ":="); /* THIS 
IS NOT REDUNDANT */ _clear_then_set((int *)flags, bind_bits, MEM_BIND_MASK); xfree(*mem_bind); if (list && *list) { diff --git a/src/plugins/mpi/pmi2/spawn.c b/src/plugins/mpi/pmi2/spawn.c index 98b1194c88f70535cce6da734d7561680d7391b0..7553b02c80f14b3535f818c60792abe46db2d3b4 100644 --- a/src/plugins/mpi/pmi2/spawn.c +++ b/src/plugins/mpi/pmi2/spawn.c @@ -525,17 +525,17 @@ static int _exec_srun_multiple(spawn_req_t *req, char **env) { int argc, ntasks, i, j, spawn_cnt, fd; - char **argv = NULL, *multi_prog = NULL, *buf = NULL; + char **argv = NULL, *buf = NULL; spawn_subcmd_t *subcmd = NULL; + char fbuf[128]; debug3("mpi/pmi2: in _exec_srun_multiple"); /* create a tmp multi_prog file */ /* TODO: how to delete the file? */ - multi_prog = tempnam(NULL, NULL); - fd = open(multi_prog, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + sprintf(fbuf, "/tmp/%d.XXXXXX", getpid()); + fd = mkstemp(fbuf); if (fd < 0) { - error("mpi/pmi2: failed to open multi-prog file %s: %m", - multi_prog); + error("mpi/pmi2: failed to open multi-prog file %s: %m", fbuf); return SLURM_ERROR; } ntasks = 0; @@ -576,7 +576,7 @@ _exec_srun_multiple(spawn_req_t *req, char **env) job_info.srun_opt->nodelist); } argv[j ++] = "--multi-prog"; - argv[j ++] = multi_prog; + argv[j ++] = fbuf; argv[j ++] = NULL; debug3("mpi/mpi2: to execve"); diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c index 7b07af3b785e04bb828ab3b0a08b0f5ad4138bb0..c556646b2399c4859ef35e9758fcf99c5680feb9 100644 --- a/src/plugins/sched/backfill/backfill.c +++ b/src/plugins/sched/backfill/backfill.c @@ -843,6 +843,7 @@ static int _attempt_backfill(void) later_start = now; TRY_LATER: if ((time(NULL) - sched_start) >= sched_timeout) { + uint32_t save_job_id = job_ptr->job_id; uint32_t save_time_limit = job_ptr->time_limit; job_ptr->time_limit = orig_time_limit; if (debug_flags & DEBUG_FLAG_BACKFILL) { @@ -862,6 +863,18 @@ static int _attempt_backfill(void) rc = 1; break; } + + /* With 
bf_continue configured, the original job could + * have been scheduled or cancelled and purged. + * Revalidate job the record here. */ + if ((job_ptr->magic != JOB_MAGIC) || + (job_ptr->job_id != save_job_id)) + continue; + if (!IS_JOB_PENDING(job_ptr)) + continue; + if (!avail_front_end(job_ptr)) + continue; /* No available frontend */ + job_ptr->time_limit = save_time_limit; /* Reset backfill scheduling timers, resume testing */ sched_start = time(NULL); diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index cbb36c748169779677c3e0b1489f3420439e2892..f359d6f0ad7f48de61b5386c67f66d4b5880f403 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -634,13 +634,13 @@ _fill_registration_msg(slurm_node_registration_status_msg_t *msg) if (first_msg) { first_msg = false; - info("Procs=%u Boards=%u Sockets=%u Cores=%u Threads=%u " + info("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u " "Memory=%u TmpDisk=%u Uptime=%u", msg->cpus, msg->boards, msg->sockets, msg->cores, msg->threads, msg->real_memory, msg->tmp_disk, msg->up_time); } else { - debug3("Procs=%u Boards=%u Sockets=%u Cores=%u Threads=%u " + debug3("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u " "Memory=%u TmpDisk=%u Uptime=%u", msg->cpus, msg->boards, msg->sockets, msg->cores, msg->threads, msg->real_memory, msg->tmp_disk, @@ -856,7 +856,7 @@ _read_config(void) if (cf->fast_schedule) { info("Node configuration differs from hardware: " "CPUs=%u:%u(hw) Boards=%u:%u(hw) " - "Sockets=%u:%u(hw) CoresPerSocket=%u:%u(hw) " + "SocketsPerBoard=%u:%u(hw) CoresPerSocket=%u:%u(hw) " "ThreadsPerCore=%u:%u(hw)", conf->cpus, conf->actual_cpus, conf->boards, conf->actual_boards, @@ -871,7 +871,7 @@ _read_config(void) "the bitmaps the slurmctld must create before " "the slurmd registers.\n" " CPUs=%u:%u(hw) Boards=%u:%u(hw) " - "Sockets=%u:%u(hw) CoresPerSocket=%u:%u(hw) " + "SocketsPerBoard=%u:%u(hw) CoresPerSocket=%u:%u(hw) " "ThreadsPerCore=%u:%u(hw)", conf->cpus, 
conf->actual_cpus, conf->boards, conf->actual_boards, @@ -1191,7 +1191,7 @@ _print_config(void) &conf->actual_threads, &conf->block_map_size, &conf->block_map, &conf->block_map_inv); - printf("CPUs=%u Boards=%u Sockets=%u CoresPerSocket=%u " + printf("CPUs=%u Boards=%u SocketsPerBoard=%u CoresPerSocket=%u " "ThreadsPerCore=%u ", conf->actual_cpus, conf->actual_boards, conf->actual_sockets, conf->actual_cores, conf->actual_threads); diff --git a/testsuite/expect/test1.89 b/testsuite/expect/test1.89 index 89c08908df5d0238297cdea6e4646636775dc87d..c691194c8ccded40c3e0e8af4d57455b3458ecc5 100755 --- a/testsuite/expect/test1.89 +++ b/testsuite/expect/test1.89 @@ -130,6 +130,7 @@ expect { set timeout 1 expect { -re $prompt { + exp_continue } timeout { } diff --git a/testsuite/expect/test1.90 b/testsuite/expect/test1.90 index 2a3007204777388ea1c83c425dbdd75d2ff961ed..bc10c07f57237f62319891142c75178b15c8fd16 100755 --- a/testsuite/expect/test1.90 +++ b/testsuite/expect/test1.90 @@ -142,6 +142,7 @@ expect { set timeout 1 expect { -re $prompt { + exp_continue } timeout { }