diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am index 82dcd978c82c239d0cef8767609465fee8fa33b1..2b40d4a701f7cb0ece9007db02bfca6922a66e27 100644 --- a/testsuite/expect/Makefile.am +++ b/testsuite/expect/Makefile.am @@ -205,6 +205,8 @@ EXTRA_DIST = \ test8.20 \ test8.21 \ test8.21.bash \ + test8.22 \ + test8.23 \ test9.1 \ test9.2 \ test9.3 \ diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in index d8ca829822188a88b81f7742e009406ba817d6ff..bb98c848174c5bbced51a529e488c14c5c2423ec 100644 --- a/testsuite/expect/Makefile.in +++ b/testsuite/expect/Makefile.in @@ -485,6 +485,8 @@ EXTRA_DIST = \ test8.20 \ test8.21 \ test8.21.bash \ + test8.22 \ + test8.23 \ test9.1 \ test9.2 \ test9.3 \ diff --git a/testsuite/expect/README b/testsuite/expect/README index ea1f5ab954f21db5b1eabf1f685201e515e93c92..0e20828eb4e0cfdf97d699453915ba249812b83d 100644 --- a/testsuite/expect/README +++ b/testsuite/expect/README @@ -333,6 +333,10 @@ test8.20 Bluegene/Q only: Test that job step allocations are a valid size and within the job's allocation test8.21 Bluegene/Q only: Test that multple job step allocations are properly packed within the job's allocation +test8.22 Bluegene/Q only: Stress test of running many job step allocations + within the job's allocation +test8.23 Bluegene/Q only: Test that multple jobs allocations are properly + packed within a midplane test9.# System stress testing. Exercises all commands and daemons. diff --git a/testsuite/expect/test8.21 b/testsuite/expect/test8.21 index 515498dd527ed87c49c76085f6036e2d417c73c5..ed271f170f28b97c4e9ecfc079417d7420009e30 100755 --- a/testsuite/expect/test8.21 +++ b/testsuite/expect/test8.21 @@ -172,7 +172,7 @@ expect { if [info exists use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)] { incr use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) } else { - send_user "\nFAILURE: invalid step cnode allocation at " + send_user "\nFAILURE: invalid step c-node allocation at " send_user "\[$dim1,$dim2,$dim3,$dim4,$dim5\]/" set exit_code 1 } @@ -193,7 +193,7 @@ expect { if [info exists use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)] { incr use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) } else { - send_user "\nFAILURE: invalid step cnode allocation at " + send_user "\nFAILURE: invalid step c-node allocation at " send_user "\[$dim1,$dim2,$dim3,$dim4,$dim5\]\n" set exit_code 1 } @@ -217,7 +217,7 @@ for {set dim1 $job_start1} {$dim1 <= $job_fini1} {incr dim1} { for {set dim4 $job_start4} {$dim4 <= $job_fini4} {incr dim4} { for {set dim5 $job_start5} {$dim5 <= $job_fini5} {incr dim5} { if {$use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) != 1} { - send_user "\nFAILURE: cnode at \[$dim1,$dim2,$dim3,$dim4,$dim5\] " + send_user "\nFAILURE: c-node at \[$dim1,$dim2,$dim3,$dim4,$dim5\] " send_user "allocated $use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) times\n" set exit_code 1 } @@ -273,7 +273,7 @@ expect { if [info exists use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)] { incr use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) } else { - send_user "\nFAILURE: invalid step cnode allocation at " + send_user "\nFAILURE: invalid step c-node allocation at " send_user "\[$dim1,$dim2,$dim3,$dim4,$dim5\]/" set exit_code 1 } @@ -294,7 +294,7 @@ expect { if [info exists use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)] { incr use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) } else { - send_user "\nFAILURE: invalid step cnode allocation at " + send_user "\nFAILURE: invalid step c-node allocation at " send_user "\[$dim1,$dim2,$dim3,$dim4,$dim5\]\n" set exit_code 1 } @@ -320,7 +320,7 @@ for {set dim1 $job_start1} {$dim1 <= $job_fini1} {incr dim1} { for {set dim4 $job_start4} {$dim4 <= $job_fini4} {incr dim4} { for {set dim5 $job_start5} {$dim5 <= $job_fini5} {incr dim5} { if {$use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) > 1} { - send_user "\nFAILURE: cnode at \[$dim1,$dim2,$dim3,$dim4,$dim5\] " + send_user "\nFAILURE: c-node at \[$dim1,$dim2,$dim3,$dim4,$dim5\] " send_user "allocated $use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) times\n" set exit_code 1 } diff --git a/testsuite/expect/test8.22 b/testsuite/expect/test8.22 new file mode 100755 index 0000000000000000000000000000000000000000..a0407360ebbda6c0ffa7107acd0eecf1d899d508 --- /dev/null +++ b/testsuite/expect/test8.22 @@ -0,0 +1,163 @@ +#!/usr/bin/expect +############################################################################ +# Purpose: Test of SLURM functionality +# Bluegene/Q only: Stress test of running many job step allocations +# within the job's allocation +# +# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR +# "FAILURE: ..." otherwise with an explanation of the failure, OR +# anything else indicates a failure mode that must be investigated. +############################################################################ +# Copyright (C) 2011 SchedMD LLC +# Written by Morris Jette <jette@schedmd.gov> +# +# This file is part of SLURM, a resource management program. +# For details, see <http://www.schedmd.com/slurmdocs/>. +# Please also read the included file: DISCLAIMER. +# +# SLURM is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along +# with SLURM; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +############################################################################ +source ./globals + +set test_id "8.22" +set exit_code 0 +set file_in "test$test_id.in" +set job_id 0 +set job_size 32 + + +print_header $test_id + +if {([test_bluegene] == 0) || [string compare [get_bluegene_type] "Q"]} { + send_user "\nWARNING: This test is only compatable with Bluegene/Q systems\n" + exit $exit_code +} + +# +# Spawn a job via salloc +# +set matches 0 +set timeout $max_job_delay +set salloc_pid [spawn $salloc -N$job_size -t1 $bin_bash] +expect { + -re "Granted job allocation ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + -re $prompt { + #send_user "Job initiated\n" + } + timeout { + send_user "\nFAILURE: salloc not responding\n" + if {$job_id != 0} { + cancel_job $job_id + } + slow_kill [expr 0 - $salloc_pid] + exit 1 + } +} + +if {$job_id == 0} { + send_user "\nFAILURE: did not get job_id\n" + exit 1 +} + +# +# Determine the job's allocation dimensions +# +set job_start1 -1 +send "$scontrol show job $job_id\r" +expect { + -re "BP_List=($alpha_numeric).($digit)($digit)($digit)($digit)($digit)x($digit)($digit)($digit)($digit)($digit)" { + set job_prefix $expect_out(1,string) + set job_start1 $expect_out(2,string) + set job_start2 $expect_out(3,string) + set job_start3 $expect_out(4,string) + set job_start4 $expect_out(5,string) + set job_start5 $expect_out(6,string) + set job_fini1 $expect_out(7,string) + set job_fini2 $expect_out(8,string) + set job_fini3 $expect_out(9,string) + set job_fini4 $expect_out(10,string) + set job_fini5 $expect_out(11,string) + exp_continue + } + -re $prompt { + #break + } + timeout { + send_user "\nFAILURE: scontrol not responding\n" + set exit_code 1 + } +} +if {$job_start1 == -1} { + send_user "\nFAILURE: did not get job dimensions\n" + cancel_job $job_id + exit 1 +} +send_user "\nJob allocation\n" +send_user "prefix: $job_prefix\n" +send_user "dim 1: $job_start1 to $job_fini1 " +send_user "dim 2: $job_start2 to $job_fini2 " +send_user "dim 3: $job_start3 to $job_fini3 " +send_user "dim 4: $job_start4 to $job_fini4 " +send_user "dim 5: $job_start5 to $job_fini5\n" + +set job_dim1 [expr $job_fini1 - $job_start1 + 1] +set job_dim2 [expr $job_fini2 - $job_start2 + 1] +set job_dim3 [expr $job_fini3 - $job_start3 + 1] +set job_dim4 [expr $job_fini4 - $job_start4 + 1] +set job_dim5 [expr $job_fini5 - $job_start5 + 1] +set actual_job_size [expr $job_dim1 * $job_dim2 * $job_dim3 * $job_dim4 * $job_dim5] +send_user "size: $actual_job_size c-nodes\n" +if {$actual_job_size < $job_size} { + send_user "\nFAILURE: job allocation too small ($actual_job_size < $job_size)\n" + cancel_job $job_id + exit 1 +} +if {$actual_job_size != $job_size} { +# This is a legitimate condition. A request for 5 c-nodes requires +# at least 6 c-nodes (3x2x1x1x1). + send_user "\nWARNING: job allocation too large ($actual_job_size != $job_size)\n" +} + +make_bash_script $file_in " +for ((inx=0; inx<$actual_job_size; inx++)) ; do + $srun -N4 sleep 1 & + $srun -N2 sleep 1 & + $srun -N1 sleep 1 & +done +wait" + +send "./$file_in\r" +expect { + -re $prompt { + send "exit\r" + exp_continue + } + timeout { + send_user "\nFAILURE: job not responding\n" + set exit_code 1 + } +} + +if {$exit_code == 0} { + exec rm -f $file_in + send_user "\nSUCCESS\n" +} else { + cancel_job $job_id +} + +exit $exit_code diff --git a/testsuite/expect/test8.23 b/testsuite/expect/test8.23 new file mode 100755 index 0000000000000000000000000000000000000000..6ec366caf773b0f7c9514c9c56ad18e0e91b19f5 --- /dev/null +++ b/testsuite/expect/test8.23 @@ -0,0 +1,283 @@ +#!/usr/bin/expect +############################################################################ +# Purpose: Test of SLURM functionality +# Bluegene/Q only: Test that multple jobs allocations are properly +# packed within a midplane +# +# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR +# "FAILURE: ..." otherwise with an explanation of the failure, OR +# anything else indicates a failure mode that must be investigated. +############################################################################ +# Copyright (C) 2011 SchedMD LLC +# Written by Morris Jette <jette@schedmd.gov> +# +# This file is part of SLURM, a resource management program. +# For details, see <http://www.schedmd.com/slurmdocs/>. +# Please also read the included file: DISCLAIMER. +# +# SLURM is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along +# with SLURM; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +############################################################################ +source ./globals + +set test_id "8.23" +set exit_code 0 +set file_in "test$test_id.in" +set job_size 32 + + +print_header $test_id + +if {([test_bluegene] == 0) || [string compare [get_bluegene_type] "Q"]} { + send_user "\nWARNING: This test is only compatable with Bluegene/Q systems\n" + exit $exit_code +} + +make_bash_script $file_in "sleep 300" + +# +# Submit a sub-midplane job +# +set matches 0 +set timeout $max_job_delay +set job_id(1) 0 +set sbatch_pid [spawn $sbatch -N$job_size -t2 $file_in] +expect { + -re "Submitted batch job ($number)" { + set job_id(1) $expect_out(1,string) + exp_continue + } + timeout { + send_user "\nFAILURE: sbatch not responding\n" + if {$job_id(1) != 0} { + cancel_job $job_id(1) + } + slow_kill [expr 0 - $sbatch_pid] + exit 1 + } + eof { + wait + } +} +if {$job_id(1) == 0} { + send_user "\nFAILURE: did not get job_id\n" + exit 1 +} + +# +# Determine the job's midplane name and allocation dimensions +# +if {[wait_for_job $job_id(1) RUNNING] != 0} { + send_user "\nFAILURE: error starting job $job_id(1)\n" + cancel_job $job_id + exit 1 +} +set job_start(1,1) -1 +spawn $scontrol show job $job_id(1) +expect { + -re "BP_List=($alpha_numeric).($digit)($digit)($digit)($digit)($digit)x($digit)($digit)($digit)($digit)($digit)" { + set job_prefix(1) $expect_out(1,string) + set job_start(1,1) $expect_out(2,string) + set job_start(1,2) $expect_out(3,string) + set job_start(1,3) $expect_out(4,string) + set job_start(1,4) $expect_out(5,string) + set job_start(1,5) $expect_out(6,string) + set job_fini(1,1) $expect_out(7,string) + set job_fini(1,2) $expect_out(8,string) + set job_fini(1,3) $expect_out(9,string) + set job_fini(1,4) $expect_out(10,string) + set job_fini(1,5) $expect_out(11,string) + exp_continue + } + timeout { + send_user "\nFAILURE: scontrol not responding\n" + set exit_code 1 + } + eof { + wait + } +} +if {$job_start(1,1) == -1} { + send_user "\nFAILURE: did not get job dimensions\n" + cancel_job $job_id(1) + exit 1 +} +send_user "\nJob allocation\n" +send_user "prefix: $job_prefix(1)\n" +send_user "dim 1: $job_start(1,1) to $job_fini(1,1) " +send_user "dim 2: $job_start(1,2) to $job_fini(1,2) " +send_user "dim 3: $job_start(1,3) to $job_fini(1,3) " +send_user "dim 4: $job_start(1,4) to $job_fini(1,4) " +send_user "dim 5: $job_start(1,5) to $job_fini(1,5)\n" + +set job_dim(1,1) [expr $job_fini(1,1) - $job_start(1,1) + 1] +set job_dim(1,2) [expr $job_fini(1,2) - $job_start(1,2) + 1] +set job_dim(1,3) [expr $job_fini(1,3) - $job_start(1,3) + 1] +set job_dim(1,4) [expr $job_fini(1,4) - $job_start(1,4) + 1] +set job_dim(1,5) [expr $job_fini(1,5) - $job_start(1,5) + 1] +set actual_job_size [expr $job_dim(1,1) * $job_dim(1,2) * $job_dim(1,3) * $job_dim(1,4) * $job_dim(1,5)] +send_user "size: $actual_job_size c-nodes\n" +if {$actual_job_size < $job_size} { + send_user "\nFAILURE: job allocation too small ($actual_job_size < $job_size)\n" + cancel_job $job_id + exit 1 +} +if {$actual_job_size != $job_size} { +# This is a legitimate condition. A request for 5 c-nodes requires +# at least 6 c-nodes (3x2x1x1x1). + send_user "\nWARNING: job allocation too large ($actual_job_size != $job_size)\n" +} + +# +# Submit more jobs to fill that midplane +# +set job_count [expr 512 / $actual_job_size] +for {set inx 2} {$inx <= $job_count} {incr inx} { + set sbatch_pid [spawn $sbatch -N$job_size -w $job_prefix(1) -t2 $file_in] + expect { + -re "Submitted batch job ($number)" { + set job_id($inx) $expect_out(1,string) + exp_continue + } + timeout { + send_user "\nFAILURE: sbatch not responding\n" + if {$job_id($inx) != 0} { + cancel_job $job_id($inx) + } + slow_kill [expr 0 - $sbatch_pid] + exit 1 + } + eof { + wait + } + } + if {$job_id($inx) == 0} { + send_user "\nFAILURE: did not get job_id\n" + exit 1 + } +} + +# +# Give the new jobs a chance to start and see what resources they are allocated +# +sleep 15 +for {set inx 2} {$inx <= $job_count} {incr inx} { + set job_start($inx,1) -1 + spawn $scontrol show job $job_id($inx) + expect { + -re "BP_List=($alpha_numeric).($digit)($digit)($digit)($digit)($digit)x($digit)($digit)($digit)($digit)($digit)" { + set job_prefix($inx) $expect_out(1,string) + set job_start($inx,1) $expect_out(2,string) + set job_start($inx,2) $expect_out(3,string) + set job_start($inx,3) $expect_out(4,string) + set job_start($inx,4) $expect_out(5,string) + set job_start($inx,5) $expect_out(6,string) + set job_fini($inx,1) $expect_out(7,string) + set job_fini($inx,2) $expect_out(8,string) + set job_fini($inx,3) $expect_out(9,string) + set job_fini($inx,4) $expect_out(10,string) + set job_fini($inx,5) $expect_out(11,string) + exp_continue + } + timeout { + send_user "\nFAILURE: scontrol not responding\n" + set exit_code 1 + } + eof { + wait + } + } +} + +# +# Build an array to count the job's c-nodes which have been allocated to our jobs +# +for {set dim1 0} {$dim1 <= 3} {incr dim1} { + for {set dim2 0} {$dim2 <= 3} {incr dim2} { + for {set dim3 0} {$dim3 <= 3} {incr dim3} { + for {set dim4 0} {$dim4 <= 3} {incr dim4} { + for {set dim5 0} {$dim5 <= 1} {incr dim5} { + set use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) 0 + } + } + } + } +} + +# +# Validate the allocated midplane name and count specific allocated c-nodes +# +for {set inx 1} {$inx <= $job_count} {incr inx} { + if {$job_start($inx,1) == -1} { + continue + } + if {[string compare $job_prefix(1) $job_prefix($inx)]} { + send_user "\nFAILURE: job $job_id($inx) is running on " + send_user "midplane job_prefix($inx) instead of $job_prefix(1)\n" + set exit_code 1 + continue + } + for {set dim1 $job_start($inx,1)} {$dim1 <= $job_fini($inx,1)} {incr dim1} { + for {set dim2 $job_start($inx,2)} {$dim2 <= $job_fini($inx,2)} {incr dim2} { + for {set dim3 $job_start($inx,3)} {$dim3 <= $job_fini($inx,3)} {incr dim3} { + for {set dim4 $job_start($inx,4)} {$dim4 <= $job_fini($inx,4)} {incr dim4} { + for {set dim5 $job_start($inx,5)} {$dim5 <= $job_fini($inx,5)} {incr dim5} { + if [info exists use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)] { + incr use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) + } else { + send_user "\nFAILURE: invalid c-node allocation for job " + send_user "$job_id($inx) at \[$dim1,$dim2,$dim3,$dim4,$dim5\]/" + set exit_code 1 + } + } + } + } + } + } +} + +# +# Test that no c-nodes have been allocated more than once to our jobs +# This is an independent loop so that we might check for unallocated c-nodes +# +set unused_count 0 +for {set dim1 0} {$dim1 <= 3} {incr dim1} { + for {set dim2 0} {$dim2 <= 3} {incr dim2} { + for {set dim3 0} {$dim3 <= 3} {incr dim3} { + for {set dim4 0} {$dim4 <= 3} {incr dim4} { + for {set dim5 0} {$dim5 <= 1} {incr dim5} { + if {$use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) > 1} { + send_user "\nFAILURE: c-node at \[$dim1,$dim2,$dim3,$dim4,$dim5\] " + send_user "allocated $use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) times\n" + set exit_code 1 + } elseif {$use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) == 0} { + incr unused_count + } + } + } + } + } +} +send_user "\nNOTE: $unused_count c-nodes on midplane $job_prefix(1) were unused by our jobs\n" + +for {set inx 1} {$inx <= $job_count} {incr inx} { + cancel_job $job_id($inx) +} + +if {$exit_code == 0} { + exec rm -f $file_in + send_user "\nSUCCESS\n" +} + +exit $exit_code