diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 84cbddd7cabdfc3f02e2392785be1188d78016e5..559e95a604195e7f794d8ad73ae39743ef4fac74 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -387,12 +387,14 @@ YYYY\-MM\-DD[THH:MM[:SS]]] .TP \fB\-\-delay\-boot\fR=<\fIminutes\fR> -If nodes with the features requested by the job are expected to be available -within this time period, then wait for those nodes rather than booting currently -available nodes into the desired state. +Do not reboot nodes in order to satisfy this job's feature specification if +the job has been eligible to run for less than this time period. +If the job has waited for less than the specified period, it will use only +nodes which already have the specified features. The argument is in units of minutes. -A default value may be specified using the \fBDelayBoot\fR configuration -parameter in the knl.conf file, otherwise the default value is zero (no delay). +A default value may be set by a system administrator using the \fBdelay_boot\fR +option of the \fBSchedulerParameters\fR configuration parameter in the +slurm.conf file, otherwise the default value is zero (no delay). .TP \fB\-d\fR, \fB\-\-dependency\fR=<\fIdependency_list\fR> diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index f7ed187471fec719614d243d0a9becad919deed4..25cd961eaf0d71e0fbc199ee67fc040119f01e20 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -401,12 +401,14 @@ YYYY\-MM\-DD[THH:MM[:SS]]] .TP \fB\-\-delay\-boot\fR=<\fIminutes\fR> -If nodes with the features requested by the job are expected to be available -within this time period, then wait for those nodes rather than booting currently -available nodes into the desired state. +Do not reboot nodes in order to satisfy this job's feature specification if +the job has been eligible to run for less than this time period.
+If the job has waited for less than the specified period, it will use only +nodes which already have the specified features. The argument is in units of minutes. -A default value may be specified using the \fBDelayBoot\fR configuration -parameter in the knl.conf file, otherwise the default value is zero (no delay). +A default value may be set by a system administrator using the \fBdelay_boot\fR +option of the \fBSchedulerParameters\fR configuration parameter in the +slurm.conf file, otherwise the default value is zero (no delay). .TP \fB\-d\fR, \fB\-\-dependency\fR=<\fIdependency_list\fR> diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 579e3193370badf84e501fab7279c418a1229c5c..c9bd6c3c739455670dafe7509f552d5d9335b22c 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -596,12 +596,14 @@ This option applies only to job allocations. .TP \fB\-\-delay\-boot\fR=<\fIminutes\fR> -If nodes with the features requested by the job are expected to be available -within this time period, then wait for those nodes rather than booting currently -available nodes into the desired state. +Do not reboot nodes in order to satisfy this job's feature specification if +the job has been eligible to run for less than this time period. +If the job has waited for less than the specified period, it will use only +nodes which already have the specified features. The argument is in units of minutes. -A default value may be specified using the \fBDelayBoot\fR configuration -parameter in the knl.conf file, otherwise the default value is zero (no delay). +A default value may be set by a system administrator using the \fBdelay_boot\fR +option of the \fBSchedulerParameters\fR configuration parameter in the +slurm.conf file, otherwise the default value is zero (no delay). This option applies only to job allocations.
diff --git a/doc/man/man5/knl.conf.5 b/doc/man/man5/knl.conf.5 index 3a115a24c4d6f9b391962899ab55cf314d2bb532..f1abe8195816e3de75b3e7d4ad9dac5789474ada 100644 --- a/doc/man/man5/knl.conf.5 +++ b/doc/man/man5/knl.conf.5 @@ -122,14 +122,6 @@ The value can include one of the possible values identified with the \fBAllowNUMA\fR configuration parameter above. The default value is "a2a". -.TP -\fBDelayBoot\fR -If nodes with the MCDRAM and NUMA state requested by the job are expected to -be available within this time period, then wait for those nodes rather than -booting currently available nodes into the desired state. -May be overridden by a job's --delay-boot option. -The argument value is in minutes with a default value of zero (no delay). - .TP \fBLogFile\fR Fully qualified path to a log file. diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 501eedf6b3766fcd09f088e3e73d3a725018113f..8e4ed37c3458e1746d6d71f59516499b160f4b71 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -2595,6 +2595,15 @@ This option may improve system responsiveness when large numbers of jobs (many hundreds) are submitted at the same time, but it will delay the initiation time of individual jobs. Also see \fBdefault_queue_depth\fR above. .TP +\fBdelay_boot=#\fR +Do not reboot nodes in order to satisfy a job's feature specification if +the job has been eligible to run for less than this time period. +If the job has waited for less than the specified period, it will use only +nodes which already have the specified features. +The argument is in units of minutes. +Individual jobs may override this default value with the \fB\-\-delay\-boot\fR +option. +.TP \fBdisable_user_top\fB Disable use of the "scontrol top" command by non-privileged users.
.TP diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 7dfe053fa447ef71fc6319296eb8d4a4ee06daff..46d00679e6a5edf4bee18153d15326545f8c01ea 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -130,6 +130,7 @@ time_t last_job_update; /* time of last update to job records */ /* Local variables */ static int bf_min_age_reserve = 0; +static uint32_t delay_boot = 0; static uint32_t highest_prio = 0; static uint32_t lowest_prio = TOP_PRIORITY; static int hash_table_size = 0; @@ -4114,12 +4115,35 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, { static time_t sched_update = 0; static int defer_sched = 0; - char *sched_params, *tmp_ptr; + char *key, *sched_params, *tmp_ptr; int error_code, i; bool no_alloc, top_prio, test_only, too_fragmented, independent; struct job_record *job_ptr; time_t now = time(NULL); + if (sched_update != slurmctld_conf.last_update) { + sched_update = slurmctld_conf.last_update; + sched_params = slurm_get_sched_params(); + if (sched_params && strstr(sched_params, "defer")) + defer_sched = 1; + else + defer_sched = 0; + key = strstr(sched_params, "delay_boot="); + if (key) { + i = time_str2secs(key + 11); + if (i != NO_VAL) + delay_boot = i; + } + bf_min_age_reserve = 0; + if (sched_params && + (tmp_ptr = strstr(sched_params, "bf_min_age_reserve="))) { + int min_age = atoi(tmp_ptr + 19); + if (min_age > 0) + bf_min_age_reserve = min_age; + } + xfree(sched_params); + } + if (job_specs->array_bitmap) i = bit_set_count(job_specs->array_bitmap); else @@ -4177,22 +4201,6 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, else too_fragmented = false; - if (sched_update != slurmctld_conf.last_update) { - sched_update = slurmctld_conf.last_update; - sched_params = slurm_get_sched_params(); - if (sched_params && strstr(sched_params, "defer")) - defer_sched = 1; - else - defer_sched = 0; - bf_min_age_reserve = 0; - if (sched_params && - (tmp_ptr = strstr(sched_params, 
"bf_min_age_reserve="))) { - int min_age = atoi(tmp_ptr + 19); - if (min_age > 0) - bf_min_age_reserve = min_age; - } - xfree(sched_params); - } if (defer_sched == 1) too_fragmented = true; @@ -7051,7 +7059,9 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, job_ptr->job_state = JOB_PENDING; job_ptr->time_limit = job_desc->time_limit; job_ptr->deadline = job_desc->deadline; - if (job_desc->delay_boot != NO_VAL) + if (job_desc->delay_boot == NO_VAL) + job_ptr->delay_boot = delay_boot; + else job_ptr->delay_boot = job_desc->delay_boot; if (job_desc->time_min != NO_VAL) job_ptr->time_min = job_desc->time_min; diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 2678b4dfb1ae232c8309ab44fa47f2b97beb377c..79a6a386a5d7a523ef80c6fb9ee386ba9a7091d4 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -1068,8 +1068,8 @@ _get_req_features(struct node_set *node_set_ptr, int node_set_size, job_ptr->details->req_node_bitmap = NULL; } saved_min_cpus = job_ptr->details->min_cpus; - /* Don't mess with max_cpus here since it is only set (as of - * 2.2 to be a limit and not user configurable. */ + /* Don't mess with max_cpus here since it is only set to be a limit + * and not user configurable. 
*/ job_ptr->details->min_cpus = 1; tmp_node_set_ptr = xmalloc(sizeof(struct node_set) * node_set_size * 2); @@ -2132,6 +2132,23 @@ static bool _first_array_task(struct job_record *job_ptr) return false; } +/* Return TRUE if nodes can be rebooted now to provide the features this job requires */ +static bool _job_reboot_test(struct job_record *job_ptr, bool test_only) +{ + if (!node_features_g_user_update(job_ptr->user_id)) + return false; + + if (test_only) + return true; + + if (job_ptr->details && + ((job_ptr->details->begin_time == 0) || + ((job_ptr->details->begin_time+job_ptr->delay_boot) > time(NULL)))) + return false; + + return true; +} + /* * select_nodes - select and allocate nodes to a specific job * IN job_ptr - pointer to the job record @@ -2237,7 +2254,7 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, } /* build sets of usable nodes based upon their configuration */ - can_reboot = node_features_g_user_update(job_ptr->user_id); + can_reboot = _job_reboot_test(job_ptr, test_only); error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size, err_msg, test_only, can_reboot); if (error_code) diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index a75c2f4a8370eec39bd77a4c75cd2e1eed884574..0bf477089191db97f5e0091b2d54f22bceffc223 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -582,6 +582,11 @@ struct job_record { uint16_t batch_flag; /* 1 or 2 if batch job (with script), * 2 indicates retry mode (one retry) */ char *batch_host; /* host executing batch script */ + double billable_tres; /* calculated billable tres for the + * job, as defined by the partition's + * billing weight. Recalculated upon job + * resize. Cannot be calculated until + * the job is allocated resources.
*/ uint32_t bit_flags; /* various flags */ char *burst_buffer; /* burst buffer specification */ char *burst_buffer_state; /* burst buffer state */ @@ -594,11 +599,6 @@ struct job_record { * by the job, decremented while job is * completing (N/A for bluegene * systems) */ - double billable_tres; /* calculated billable tres for the - * job, as defined by the partition's - * billing weight. Recalculated upon job - * resize. Cannot be calculated until - * the job is alloocated resources. */ uint16_t cr_enabled; /* specify if Consumable Resources * is enabled. Needed since CR deals * with a finer granularity in its