diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am index 62942197eb7fa6adf9e93164edfe0656c1eef6c0..16a3879f94ddf61192c74e643e33efcc2bcce053 100644 --- a/testsuite/expect/Makefile.am +++ b/testsuite/expect/Makefile.am @@ -249,6 +249,7 @@ EXTRA_DIST = \ test7.5.prog.c \ test7.6 \ test7.6.prog.c \ + test7.7 \ test7.9 \ test7.9.prog.c \ test7.10 \ diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in index c2eeaef5064af79e4e2e237d09781606e7a1b8ea..b71a9674cb9c513b8b53d0e3667bd8e46f159808 100644 --- a/testsuite/expect/Makefile.in +++ b/testsuite/expect/Makefile.in @@ -666,6 +666,7 @@ EXTRA_DIST = \ test7.5.prog.c \ test7.6 \ test7.6.prog.c \ + test7.7 \ test7.9 \ test7.9.prog.c \ test7.10 \ diff --git a/testsuite/expect/README b/testsuite/expect/README index 8c96fe626ad6356eebb892f99a2c50d59a625bfd..651d9221418538d18232d6a0d6f67dedafb4f269 100644 --- a/testsuite/expect/README +++ b/testsuite/expect/README @@ -353,7 +353,7 @@ test7.4 Test of TotalView operation with srun, with and without bulk transfer. test7.5 Test of TotalView termination logic for srun. test7.6 Test of TotalView operation with sattach -test7.7 (removed) +test7.7 Test of resource allocation layout. test7.8 (removed) test7.9 Test that no files are open in spawned tasks (except stdin, stdout, and stderr) to ensure successful checkpoint/restart. 
diff --git a/testsuite/expect/globals b/testsuite/expect/globals
index 075a90b1ff18809cfaf868f32402aa6b29b397ae..21b2c94a10eba043f6d9ee2fae6d4bb9c95edebd 100755
--- a/testsuite/expect/globals
+++ b/testsuite/expect/globals
@@ -2015,6 +2015,98 @@ proc test_select_type { } {
 	return $type
 }
 
+################################################################
+#
+# Proc: get_select_type_params
+#
+# Purpose: Determine SelectTypeParameters for a given partition,
+#          using the cluster-wide value if the partition has none
+#
+# Returns string containing SelectTypeParameters
+#
+################################################################
+proc get_select_type_params { partition } {
+	global scontrol bin_bash bin_grep alpha_numeric_comma
+
+	log_user 0
+	set params ""
+	if {[string compare $partition ""]} {
+		spawn -noecho $bin_bash -c "exec $scontrol show part $partition | $bin_grep SelectTypeParameters"
+		expect {
+			-re "SelectTypeParameters *= *NONE" {
+				exp_continue
+			}
+			-re "SelectTypeParameters *= *($alpha_numeric_comma)" {
+				set params $expect_out(1,string)
+				exp_continue
+			}
+			eof {
+				wait
+			}
+		}
+	}
+	if {![string compare $params ""]} {
+		spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SelectTypeParameters"
+		expect {
+			-re "SelectTypeParameters *= *($alpha_numeric_comma)" {
+				set params $expect_out(1,string)
+				exp_continue
+			}
+			eof {
+				wait
+			}
+		}
+	}
+	log_user 1
+
+	return $params
+}
+
+################################################################
+#
+# Proc: get_total_cpus
+#
+# Purpose: Return the TotalCPUs count for a given partition
+#
+# Returns TotalCPUs (of the first partition listed if none found), or 0
+#
+################################################################
+proc get_total_cpus { partition } {
+	global scontrol bin_bash bin_grep number
+
+	log_user 0
+	set total_cpus 0
+	if {[string compare $partition ""]} {
+		spawn -noecho $bin_bash -c "exec $scontrol show part $partition"
+		expect {
+			-re "TotalCPUs *= *($number)" {
+				set total_cpus $expect_out(1,string)
+				exp_continue
+			}
+			eof {
+				wait
+			}
+		}
+	}
+	if { $total_cpus < 1 } {
+		spawn -noecho $bin_bash -c "exec $scontrol show part"
+		expect {
+			-re "TotalCPUs *= *($number)" {
+				if { $total_cpus < 1 } {
+					set total_cpus $expect_out(1,string)
+				}
+				exp_continue
+			}
+			eof {
+				wait
+			}
+		}
+	}
+	log_user 1
+
+	return $total_cpus
+}
+
 ################################################################
 #
 # Proc: test_select_type_params
diff --git a/testsuite/expect/test7.7 b/testsuite/expect/test7.7
new file mode 100755
index 0000000000000000000000000000000000000000..509b8a6d50ca839dc34c41a9e9a3af0f5cf9ba46
--- /dev/null
+++ b/testsuite/expect/test7.7
@@ -0,0 +1,204 @@
+#!/usr/bin/env expect
+############################################################################
+# Purpose: Test of SLURM functionality
+#          Test of resource allocation layout. Specifically make sure that
+#          no excess CPUs are allocated to the job. See bug 6274.
+#
+# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
+#          "FAILURE: ..." otherwise with an explanation of the failure, OR
+#          anything else indicates a failure mode that must be investigated.
+############################################################################
+# Copyright (C) 2019 SchedMD LLC
+# Written by Morris Jette
+#
+# This file is part of SLURM, a resource management program.
+# For details, see <https://slurm.schedmd.com/>.
+# Please also read the included file: DISCLAIMER.
+#
+# SLURM is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or (at your option)
+# any later version.
+#
+# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along
+# with SLURM; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+############################################################################
+source ./globals
+
+set test_id     "7.7"
+set exit_code   0
+set file_in     "test$test_id.input"
+set file_out    "test$test_id.output"
+set job_name    "test$test_id"
+set job_id      0
+set max_job_state_delay 10
+
+proc test_alloc_size { task_cnt } {
+	global bin_cat bin_rm file_in file_out sbatch job_name number
+	global alloc_unit_num exit_code max_tasks max_job_state_delay
+
+	exec $bin_rm -f $file_out
+	set job_id 0
+	spawn $sbatch -t1 -o $file_out -J $job_name -n $task_cnt $file_in
+	expect {
+		-re "Submitted batch job ($number)" {
+			set job_id $expect_out(1,string)
+			exp_continue
+		}
+		timeout {
+			send_user "\nFAILURE: sbatch not responding\n"
+			set exit_code 1
+		}
+		eof {
+			wait
+		}
+	}
+	if {[wait_for_job $job_id "DONE"] != 0} {
+		send_user "\nWARNING: job $job_id did not complete\n"
+		cancel_job $job_id
+		set max_tasks $task_cnt	;# ends the size loop in the main script
+		return
+	}
+	if {[wait_for_file $file_out] != 0} {
+		send_user "\nFAILURE: no output file\n"
+		set exit_code 1
+		return
+	}
+	set num_cpus -1
+	set num_nodes -1
+	set threads_per_core 1
+	set cores_per_socket 1
+	set sockets_per_node 1
+	spawn $bin_cat $file_out
+	expect {
+		-re "NumCPUs=($number)" {
+			set num_cpus $expect_out(1,string)
+			exp_continue
+		}
+		-re "SLURM_NNODES=($number)" {
+			set num_nodes $expect_out(1,string)
+			exp_continue
+		}
+		-re "ThreadsPerCore=($number)" {
+			if {$threads_per_core < $expect_out(1,string)} {
+				set threads_per_core $expect_out(1,string)
+			}
+			exp_continue
+		}
+		eof {
+			wait
+		}
+	}
+	log_user 0	;# the same file is re-read below; do not echo it again
+	spawn $bin_cat $file_out
+	expect {
+		-re "CoresPerSocket=($number)" {
+			if {$cores_per_socket < $expect_out(1,string)} {
+				set cores_per_socket $expect_out(1,string)
+			}
+			exp_continue
+		}
+		eof {
+			wait
+		}
+	}
+	spawn $bin_cat $file_out
+	expect {
+		-re "Sockets=($number)" {
+			if {$sockets_per_node < $expect_out(1,string)} {
+				set sockets_per_node $expect_out(1,string)
+			}
+			exp_continue
+		}
+		eof {
+			wait
+		}
+	}
+	log_user 1
+
+#	CPUs per allocation unit; alloc_unit_num: 1=node, 2=socket, 3=core, 4=CPU
+	if {$alloc_unit_num == 4} {
+		set alloc_unit 1
+	} elseif {$alloc_unit_num == 3} {
+		set alloc_unit $threads_per_core
+	} elseif {$alloc_unit_num == 2} {
+		set alloc_unit [expr {$threads_per_core * $cores_per_socket}]
+	} elseif {$alloc_unit_num == 1} {
+		set alloc_unit [expr {$threads_per_core * $cores_per_socket * $sockets_per_node}]
+	} else {
+		send_user "\nFAILURE: Invalid allocation unit: $alloc_unit_num\n"
+		set exit_code 1
+		set alloc_unit 0
+	}
+#	Allow up to (alloc_unit - 1) CPUs beyond the task count on each node
+	set max_alloc [expr {$task_cnt + (($alloc_unit - 1) * $num_nodes)}]
+	if {$num_cpus > $max_alloc} {
+		send_user "\nFAILURE: Job with $task_cnt tasks allocated too many CPUs ($num_cpus > $max_alloc)\n"
+		send_user "TASKS:$task_cnt\n"
+		send_user "CPUS:$num_cpus\n"
+		send_user "NODES:$num_nodes\n"
+		send_user "MAX_THREADS_PER_CORE:$threads_per_core\n"
+		send_user "MAX_CORES_PER_SOCKET:$cores_per_socket\n"
+		send_user "MAX_SOCKETS_PER_NODE:$sockets_per_node\n"
+		send_user "ALLOCATION_UNIT:$alloc_unit\n"
+		set exit_code 1
+	}
+}
+
+print_header $test_id
+
+set select_type [test_select_type]
+set partition [default_partition]
+set total_cpus [get_total_cpus $partition]
+set select_type_param [get_select_type_params $partition]
+if {![string compare $select_type "linear"]} {
+	set alloc_unit_str "NODE"
+	set alloc_unit_num 1
+} elseif { [string first "CR_SOCKET" $select_type_param] != -1} {
+	set alloc_unit_str "SOCKET"
+	set alloc_unit_num 2
+} elseif { [string first "CR_CORE" $select_type_param] != -1} {
+	set alloc_unit_str "CORE"
+	set alloc_unit_num 3
+} else {
+	set alloc_unit_str "CPU"
+	set alloc_unit_num 4
+}
+set max_tasks $total_cpus
+send_user "\nResource allocation unit: $alloc_unit_str\n"
+send_user "CPUs in default partition: $max_tasks\n\n"
+if {$max_tasks > 32} {
+	set max_tasks 20
+}
+
+make_bash_script $file_in "
+$scontrol -dd show job \$SLURM_JOB_ID | grep NumCPUs=
+$scontrol -dd show job \$SLURM_JOB_ID | grep CPU_IDs=
+$bin_echo ''
+env | grep SLURM_NNODES=
+env | grep SLURM_TASKS_PER_NODE=
+env | grep SLURM_JOB_CPUS_PER_NODE=
+$bin_echo ''
+$scontrol show node \$SLURM_JOB_NODELIST | grep -v Features | grep -v Gres | \
+grep -v CPUAlloc | grep -v NodeAddr | grep -v OS | grep -v Partitions | \
+grep -v CfgTRES | grep -v AllocTRES | grep -v BootTime | grep -v Watts
+"
+
+for {set inx 1} {$inx < $max_tasks && $exit_code == 0} {incr inx} {
+	test_alloc_size $inx
+}
+
+#
+# Clean up and exit
+#
+if {$exit_code == 0} {
+	exec $bin_rm -f $file_in $file_out
+	send_user "\nSUCCESS\n"
+}
+exit $exit_code