diff --git a/NEWS b/NEWS
index f5648c45cd558e7a190931a5c8a99104003694cc..7d8bcf78355be846f3de428b30f299eafba40030 100644
--- a/NEWS
+++ b/NEWS
@@ -104,6 +104,8 @@ documents those changes that are of interest to users and administrators.
  -- sbatch --ntasks option to take precedence over --ntasks-per-node plus
     node count, as documented. Set SLURM_NTASKS/SLURM_NPROCS environment
     variables accordingly.
+ -- MYSQL - Make sure suspended time is only subtracted from the CPU TRES
+    as it is the only TRES that can be given to another job while suspended.
 
 * Changes in Slurm 15.08.1
 ==========================
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index d1c7d08305782f7bdc99b7ad7a14347127863e2c..809e9201f12334fcaf99dddc20d613f29c5c1ca4 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -1074,9 +1074,10 @@ ignored if \fISchedulerType=sched/wiki\fR or
 .TP
 \fB\-\-no\-requeue\fR
-Specifies that the batch job should not be requeued after node failure.
+Specifies that the batch job should never be requeued under any circumstances.
 Setting this option will prevent system administrators from being able
-to restart the job (for example, after a scheduled downtime).
+to restart the job (for example, after a scheduled downtime), requeue it
+after a node failure, or requeue it upon preemption by a higher priority job.
 When a job is requeued, the batch script is initiated from its beginning.
 Also see the \fB\-\-requeue\fR option.
 The \fIJobRequeue\fR configuration parameter controls the default
@@ -1281,7 +1282,9 @@ silently ignored.
 .TP
 \fB\-\-requeue\fR
-Specifies that the batch job should be requeued after node failure.
+Specifies that the batch job should be eligible for requeuing.
+The job may be requeued explicitly by a system administrator, after node
+failure, or upon preemption by a higher priority job.
 When a job is requeued, the batch script is initiated from its beginning.
 Also see the \fB\-\-no\-requeue\fR option.
 The \fIJobRequeue\fR configuration parameter controls the default
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 314bb260ec6976c4cde77318c7be84a0acf34a1a..da3587510cad63ac58d7a9718797854dc8c9e02a 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -1183,11 +1183,13 @@ By default, any existing file is truncated.
 .TP
 \fBJobRequeue\fR
-This option controls what to do by default after a node failure.
-If \fBJobRequeue\fR is set to a value of 1, then any batch job running
-on the failed node will be requeued for execution on different nodes.
-If \fBJobRequeue\fR is set to a value of 0, then any job running
-on the failed node will be terminated.
+This option controls the default ability for batch jobs to be requeued.
+Jobs may be requeued explicitly by a system administrator, after node
+failure, or upon preemption by a higher priority job.
+If \fBJobRequeue\fR is set to a value of 1, then batch jobs may be requeued
+unless explicitly disabled by the user.
+If \fBJobRequeue\fR is set to a value of 0, then batch jobs will not be requeued
+unless explicitly enabled by the user.
 Use the \fBsbatch\fR \fI\-\-no\-requeue\fR or \fI\-\-requeue\fR
 option to change the default behavior for individual jobs.
 The default value is 1.
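
The following standalone sketch is not part of the patch. It only illustrates the accounting rule stated in the NEWS entry above: while a job is suspended, only its CPU TRES stops accruing usage, because CPUs are the only TRES that can be given to another job during the suspension, while memory, nodes, and the like stay allocated. The file name, the DEMO_TRES_* ids, the charge_tres() helper, and the worked numbers are simplified stand-ins, not the plugin's real types, so treat it as a sketch of the idea rather than the implementation in as_mysql_rollup.c below.

/* tres_suspend_demo.c - simplified illustration: only the CPU TRES has
 * suspended time subtracted from its usage.
 * Compile with: cc -o tres_suspend_demo tres_suspend_demo.c
 */
#include <stdio.h>
#include <stdint.h>

/* Stand-ins for the TRES ids used by the accounting code. */
#define DEMO_TRES_CPU 1
#define DEMO_TRES_MEM 2

/* Charge one TRES for a rollup period: "seconds" of allocation, minus
 * "suspend_seconds" when (and only when) the TRES is the CPU count. */
static uint64_t charge_tres(int tres_id, uint64_t count,
			    int seconds, int suspend_seconds)
{
	int loc_seconds = seconds;

	if (suspend_seconds && (tres_id == DEMO_TRES_CPU)) {
		loc_seconds -= suspend_seconds;
		if (loc_seconds < 1)
			loc_seconds = 0;
	}
	return count * (uint64_t) loc_seconds;
}

int main(void)
{
	/* A job holding 4 CPUs and 8192 MB, suspended for the whole
	 * 3600-second rollup period. */
	int seconds = 3600, suspended = 3600;

	printf("cpu-seconds: %llu\n", (unsigned long long)
	       charge_tres(DEMO_TRES_CPU, 4, seconds, suspended));
	printf("mem(MB)-seconds: %llu\n", (unsigned long long)
	       charge_tres(DEMO_TRES_MEM, 8192, seconds, suspended));
	return 0;
}

With the old rollup code a job suspended for the entire hour was skipped altogether, so the memory usage above would have been lost; with the change in the next file the CPU line drops to zero while the other TRES keep their full allocation time.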
diff --git a/src/plugins/accounting_storage/mysql/as_mysql_rollup.c b/src/plugins/accounting_storage/mysql/as_mysql_rollup.c
index 84fe858694e59fea1aa308a1fbd83c66000257c1..5c2befc511de7e46184bc9cbcefcc3758b54194b 100644
--- a/src/plugins/accounting_storage/mysql/as_mysql_rollup.c
+++ b/src/plugins/accounting_storage/mysql/as_mysql_rollup.c
@@ -308,7 +308,8 @@ static void _add_tres_2_list(List tres_list, char *tres_str, int seconds)
 }
 
 static void _add_tres_time_2_list(List tres_list, char *tres_str,
-				   int type, int seconds, bool times_count)
+				   int type, int seconds, int suspend_seconds,
+				   bool times_count)
 {
 	char *tmp_str = tres_str;
 	int id;
@@ -321,6 +322,8 @@ static void _add_tres_time_2_list(List tres_list, char *tres_str,
 		return;
 
 	while (tmp_str) {
+		int loc_seconds = seconds;
+
 		id = atoi(tmp_str);
 		if (id < 1) {
 			error("_add_tres_time_2_list: no id "
@@ -333,9 +336,18 @@ static void _add_tres_time_2_list(List tres_list, char *tres_str,
 			xassert(0);
 			break;
 		}
 
-		count = slurm_atoull(++tmp_str);
-		time = count * seconds;
+		/* Take away suspended time from TRES that are idle when the
+		 * job was suspended, currently only CPU's fill that bill.
+		 */
+		if (suspend_seconds && (id == TRES_CPU)) {
+			loc_seconds -= suspend_seconds;
+			if (loc_seconds < 1)
+				loc_seconds = 0;
+		}
+
+		count = slurm_atoull(++tmp_str);
+		time = count * loc_seconds;
 		loc_tres = _add_time_tres(tres_list, type,
 					  id, time, times_count);
 		if (loc_tres && !loc_tres->count)
@@ -810,7 +822,7 @@ static local_cluster_usage_t *_setup_cluster_usage(mysql_conn_t *mysql_conn,
 			_add_tres_time_2_list(c_usage->loc_tres,
 					      row[EVENT_REQ_TRES],
 					      TIME_DOWN,
-					      seconds, 0);
+					      seconds, 0, 0);
 
 			/* Now remove this time if there was a
 			   disconnected slurmctld during the
@@ -880,7 +892,7 @@ extern int as_mysql_hourly_rollup(mysql_conn_t *mysql_conn,
 
 	char *job_req_inx[] = {
 		"job.job_db_inx",
-		"job.id_job",
+//		"job.id_job",
 		"job.id_assoc",
 		"job.id_wckey",
 		"job.array_task_pending",
@@ -896,7 +908,7 @@ extern int as_mysql_hourly_rollup(mysql_conn_t *mysql_conn,
 	char *job_str = NULL;
 	enum {
 		JOB_REQ_DB_INX,
-		JOB_REQ_JOBID,
+//		JOB_REQ_JOBID,
 		JOB_REQ_ASSOCID,
 		JOB_REQ_WCKEYID,
 		JOB_REQ_ARRAY_PENDING,
@@ -1117,7 +1129,7 @@ extern int as_mysql_hourly_rollup(mysql_conn_t *mysql_conn,
 		xfree(query);
 
 		while ((row = mysql_fetch_row(result))) {
-			uint32_t job_id = slurm_atoul(row[JOB_REQ_JOBID]);
+			//uint32_t job_id = slurm_atoul(row[JOB_REQ_JOBID]);
 			uint32_t assoc_id = slurm_atoul(row[JOB_REQ_ASSOCID]);
 			uint32_t wckey_id = slurm_atoul(row[JOB_REQ_WCKEYID]);
 			uint32_t array_pending =
@@ -1130,7 +1142,7 @@ extern int as_mysql_hourly_rollup(mysql_conn_t *mysql_conn,
 			List loc_tres = NULL;
 			uint64_t row_energy = 0;
 			int loc_seconds = 0;
-			int seconds = 0;
+			int seconds = 0, suspend_seconds = 0;
 
 			if (row[JOB_REQ_ENERGY])
 				row_energy = slurm_atoull(row[JOB_REQ_ENERGY]);
@@ -1184,21 +1196,15 @@ extern int as_mysql_hourly_rollup(mysql_conn_t *mysql_conn,
 
 					if (row_start > local_start)
 						local_start = row_start;
-					if (row_end < local_end)
+					if (!local_end || row_end < local_end)
 						local_end = row_end;
 					tot_time = (local_end - local_start);
-					if (tot_time < 1)
-						continue;
-					seconds -= tot_time;
+					if (tot_time > 0)
+						suspend_seconds += tot_time;
 				}
 				mysql_free_result(result2);
 			}
-			if (seconds < 1) {
-				debug4("This job (%u) was suspended "
-				       "the entire hour", job_id);
-				continue;
-			}
 
 			if (last_id != assoc_id) {
 				a_usage = xmalloc(sizeof(local_id_usage_t));
@@ -1247,11 +1253,13 @@ extern int as_mysql_hourly_rollup(mysql_conn_t *mysql_conn,
 			}
 
 			_add_tres_time_2_list(loc_tres, row[JOB_REQ_TRES],
-					      TIME_ALLOC, seconds, 0);
+					      TIME_ALLOC, seconds,
+					      suspend_seconds, 0);
 			if (w_usage)
 				_add_tres_time_2_list(w_usage->loc_tres,
 						      row[JOB_REQ_TRES],
-						      TIME_ALLOC, seconds, 0);
+						      TIME_ALLOC, seconds,
+						      suspend_seconds, 0);
 
 			_add_time_tres(loc_tres,
 				       TIME_ALLOC, TRES_ENERGY,
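
Another standalone sketch, not part of the patch, showing why the hourly rollup hunk above stops subtracting suspended time directly from seconds and instead accumulates a separate suspend_seconds: each suspend record is clamped to the rollup window, a record whose end time is still zero (presumably a job still suspended when the rollup runs) is cut off at the end of the window, and the clamped durations are summed. The file name, suspended_in_window(), and the sample intervals are illustrative assumptions, not code from the plugin.

/* suspend_window_demo.c - illustrative only: clamp suspend intervals to a
 * rollup window and total the overlap.
 * Compile with: cc -o suspend_window_demo suspend_window_demo.c
 */
#include <stdio.h>
#include <time.h>

/* Sum the part of each [start, end) suspend record that overlaps the
 * rollup window [win_start, win_end).  A record with end == 0 has not
 * finished yet, so it is clipped at the end of the window. */
static int suspended_in_window(const time_t *starts, const time_t *ends,
			       int nrecords, time_t win_start, time_t win_end)
{
	int suspend_seconds = 0;

	for (int i = 0; i < nrecords; i++) {
		time_t local_start = starts[i];
		time_t local_end = ends[i];
		time_t tot_time;

		if (win_start > local_start)
			local_start = win_start;
		if (!local_end || win_end < local_end)
			local_end = win_end;
		tot_time = local_end - local_start;
		if (tot_time > 0)
			suspend_seconds += (int) tot_time;
	}
	return suspend_seconds;
}

int main(void)
{
	/* One-hour window starting at t=0: one suspend record that began
	 * before the window (900s of overlap) and one still open at the
	 * window end (1200s of overlap). */
	time_t starts[] = { -600, 2400 };
	time_t ends[]   = { 900, 0 };

	printf("suspended seconds this hour: %d\n",
	       suspended_in_window(starts, ends, 2, 0, 3600));
	return 0;
}

The old code subtracted each overlap from seconds and skipped the job entirely once it reached zero; keeping the total in suspend_seconds instead lets _add_tres_time_2_list() zero out only the CPU TRES while the remaining TRES are still charged for the full period.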
diff --git a/testsuite/expect/test14.10 b/testsuite/expect/test14.10
index d712a17bbc44fc2c473e314ebaf2fbfc83b8276e..36bb6da1666d015bae5c0b5e1ef8a79c9efc4f5d 100755
--- a/testsuite/expect/test14.10
+++ b/testsuite/expect/test14.10
@@ -47,6 +47,9 @@ if {[test_front_end] != 0} {
 } elseif {[slurmd_user_root] == 0} {
 	send_user "\nWARNING: This test requires that the SlurmdUser be root\n"
 	exit 0
+} elseif {[get_node_cnt] < 2} {
+	send_user "\nWARNING: This test requires that the configuration has at least 2 nodes.\n"
+	exit 0
 }
 
 spawn $bin_bash -c "exec $sinfo -tidle -h -o%n | head -n2 |tr \'\n\' ' ' "
@@ -65,8 +68,8 @@ expect {
 	}
 }
 if {![string compare $node1 ""] || ![string compare $node2 ""]} {
-	send_user "\nFAILURE: did not get usable hostlist\n"
-	exit 1
+	send_user "\nWARNING: Did not find at least 2 idle nodes\n"
+	exit 0
 }
 set hostlist "$node1,$node2"
diff --git a/testsuite/expect/test9.8 b/testsuite/expect/test9.8
index f8b052e0d8308e38206fba884511394cd925cc56..b7051e9174161e76d1c8d3f12e274de15e871342 100755
--- a/testsuite/expect/test9.8
+++ b/testsuite/expect/test9.8
@@ -80,9 +80,9 @@ make_bash_script $file_in "
 $bin_sleep 5
 ulimit -u `ulimit -u -H`
 for ((inx=0; inx < $task_cnt; inx++)) ; do
-$srun -N1 -n1 --mem-per-cpu=$mem_per_step $bin_sleep $sleep_time &
+$srun -N1 -n1 --mem=0 $bin_sleep $sleep_time &
 done
-$srun -N1 -n1 --mem-per-cpu=$mem_per_step $bin_sleep $sleep_time
+$srun -N1 -n1 --mem=0 $bin_sleep $sleep_time
 "
 
 #
@@ -137,7 +137,7 @@ expect {
 }
 #
 # There could be hundreds of job steps, we don't want to see
-# the details, but want to make sure that we did start a bunch
+# the details, but want to make sure that we did start many
 #
 # Determine if this is AIX (for task count, federation switch
 # prevents each node from running more than 16 tasks)
@@ -148,16 +148,34 @@ if {[test_aix]} {
 	set desired_tasks [expr $task_cnt * 2 / 3]
 }
 
-set matches 0
-while { $matches < $desired_tasks } {
-	log_user 0
-	set matches 0
-	set timeout 60
+set job_count 0
+set step_count 0
+set timeout 60
+log_user 0
+while { 1 } {
 	exec $bin_sleep 3
-	spawn $squeue --steps --user $user_name
+
+	set job_count 0
+	spawn $squeue --state R --name $job_name --user $user_name
+	expect {
+		-re "$job_name" {
+			incr job_count
+			exp_continue
+		}
+		timeout {
+			send_user "\nFAILURE: squeue not responding\n"
+			set exit_code 1
+		}
+		eof {
+			wait
+		}
+	}
+
+	set step_count 0
+	spawn $squeue --steps --name $job_name --user $user_name
 	expect {
 		-re "sleep" {
-			incr matches
+			incr step_count
 			exp_continue
 		}
 		-re "error:" {
@@ -166,28 +184,34 @@ while { $matches < $desired_tasks } {
 			exp_continue
 		}
 		timeout {
-			break
+			send_user "\nFAILURE: squeue not responding\n"
+			set exit_code 1
 		}
 		eof {
 			wait
-			break
 		}
 	}
-	if {$matches == 0} {
+	if {$step_count >= $desired_tasks || $step_count == 0} {
 		break
 	}
+	set scaled_task_cnt [expr $job_count * $desired_tasks]
+	if {$step_count >= $scaled_task_cnt} {
+		send_user "\nOnly started $job_count jobs, reducing step count target to $scaled_task_cnt\n"
+		set desired_tasks $scaled_task_cnt
+	}
+
 }
 if {[test_aix]} {
 	sleep 5
 }
 log_user 1
-if {$matches < $desired_tasks} {
-	send_user "\nFAILURE: only started $matches job steps\n"
+if {$step_count < $desired_tasks} {
+	send_user "\nFAILURE: only started $job_count jobs and $step_count steps\n"
 	send_user "  We expected at least $desired_tasks and possibly hundreds\n"
 	set exit_code 1
 } else {
-	send_user "\nwe found $matches job steps\n"
+	send_user "\nWe found $job_count jobs and $step_count steps\n"
 }
 
 spawn $scancel --quiet --user $user_name
 expect {