Skip to content
Snippets Groups Projects
Commit 31f447eb authored by Morris Jette's avatar Morris Jette
Browse files

Bluegene/Q - Add test for how job steps are packed onto job's cnodes

parent 37026b24
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/expect
############################################################################
# Purpose: Test of SLURM functionality
# Bluegene/Q only: Test that multple job step allocations are
# properly packed within the job's allocation
#
# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
# "FAILURE: ..." otherwise with an explanation of the failure, OR
# anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2011 SchedMD LLC
# Written by Morris Jette <jette@schedmd.gov>
#
# This file is part of SLURM, a resource management program.
# For details, see <http://www.schedmd.com/slurmdocs/>.
# Please also read the included file: DISCLAIMER.
#
# SLURM is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
set test_id "8.21"
set exit_code 0
set file_prog "test$test_id.bash"
set job_id 0
set job_size 32
print_header $test_id
if {([test_bluegene] == 0) || [string compare [get_bluegene_type] "Q"]} {
send_user "\nWARNING: This test is only compatable with Bluegene/Q systems\n"
exit $exit_code
}
#
# Spawn a job via salloc
#
set matches 0
set timeout $max_job_delay
set salloc_pid [spawn $salloc -N$job_size -t1 $bin_bash]
expect {
-re "Granted job allocation ($number)" {
set job_id $expect_out(1,string)
exp_continue
}
-re $prompt {
#send_user "Job initiated\n"
}
timeout {
send_user "\nFAILURE: salloc not responding\n"
if {$job_id != 0} {
cancel_job $job_id
}
slow_kill [expr 0 - $salloc_pid]
exit 1
}
}
if {$job_id == 0} {
send_user "\nFAILURE: did not get job_id\n"
exit 1
}
#
# Determine the job's allocation dimensions
#
set timeout 5
set job_start1 -1
send "$scontrol show job $job_id\r"
expect {
-re "BP_List=($alpha_numeric).($digit)($digit)($digit)($digit)($digit)x($digit)($digit)($digit)($digit)($digit)" {
set job_prefix $expect_out(1,string)
set job_start1 $expect_out(2,string)
set job_start2 $expect_out(3,string)
set job_start3 $expect_out(4,string)
set job_start4 $expect_out(5,string)
set job_start5 $expect_out(6,string)
set job_fini1 $expect_out(7,string)
set job_fini2 $expect_out(8,string)
set job_fini3 $expect_out(9,string)
set job_fini4 $expect_out(10,string)
set job_fini5 $expect_out(11,string)
exp_continue
}
-re $prompt {
#break
}
timeout {
send_user "\nFAILURE: scontrol not responding\n"
set exit_code 1
}
}
if {$job_start1 == -1} {
send_user "\nFAILURE: did not get job dimensions\n"
cancel_job $job_id
exit 1
}
send_user "\nJob allocation\n"
send_user "prefix: $job_prefix\n"
send_user "dim 1: $job_start1 to $job_fini1 "
send_user "dim 2: $job_start2 to $job_fini2 "
send_user "dim 3: $job_start3 to $job_fini3 "
send_user "dim 4: $job_start4 to $job_fini4 "
send_user "dim 5: $job_start5 to $job_fini5\n"
set job_dim1 [expr $job_fini1 - $job_start1 + 1]
set job_dim2 [expr $job_fini2 - $job_start2 + 1]
set job_dim3 [expr $job_fini3 - $job_start3 + 1]
set job_dim4 [expr $job_fini4 - $job_start4 + 1]
set job_dim5 [expr $job_fini5 - $job_start5 + 1]
set actual_job_size [expr $job_dim1 * $job_dim2 * $job_dim3 * $job_dim4 * $job_dim5]
send_user "size: $actual_job_size c-nodes\n"
if {$actual_job_size < $job_size} {
send_user "\nFAILURE: job allocation too small ($actual_job_size < $job_size)\n"
cancel_job $job_id
exit 1
}
if {$actual_job_size != $job_size} {
# This is a legitimate condition. A request for 5 c-nodes requires
# at least 6 c-nodes (3x2x1x1x1).
send_user "\nWARNING: job allocation too large ($actual_job_size != $job_size)\n"
}
#
# Build an array to count the job's c-nodes which have been allocated to steps
#
for {set dim1 $job_start1} {$dim1 <= $job_fini1} {incr dim1} {
for {set dim2 $job_start2} {$dim2 <= $job_fini2} {incr dim2} {
for {set dim3 $job_start3} {$dim3 <= $job_fini3} {incr dim3} {
for {set dim4 $job_start4} {$dim4 <= $job_fini4} {incr dim4} {
for {set dim5 $job_start5} {$dim5 <= $job_fini5} {incr dim5} {
set use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) 0
}
}
}
}
}
set timeout 30
send "./$file_prog $srun $squeue $job_id $actual_job_size\r"
expect {
-re "BP_List=($alpha_numeric).($digit)($digit)($digit)($digit)($digit)x($digit)($digit)($digit)($digit)($digit)" {
set step_prefix $expect_out(1,string)
set step_start1 $expect_out(2,string)
set step_start2 $expect_out(3,string)
set step_start3 $expect_out(4,string)
set step_start4 $expect_out(5,string)
set step_start5 $expect_out(6,string)
set step_fini1 $expect_out(7,string)
set step_fini2 $expect_out(8,string)
set step_fini3 $expect_out(9,string)
set step_fini4 $expect_out(10,string)
set step_fini5 $expect_out(11,string)
for {set dim1 $step_start1} {$dim1 <= $step_fini1} {incr dim1} {
for {set dim2 $step_start2} {$dim2 <= $step_fini2} {incr dim2} {
for {set dim3 $step_start3} {$dim3 <= $step_fini3} {incr dim3} {
for {set dim4 $step_start4} {$dim4 <= $step_fini4} {incr dim4} {
for {set dim5 $step_start5} {$dim5 <= $step_fini5} {incr dim5} {
if [info exists use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)] {
incr use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)
} else {
send_user "\nFAILURE: invalid step cnode allocation at "
send_user "\[$dim1,$dim2,$dim3,$dim4,$dim5\]/"
set exit_code 1
}
}
}
}
}
}
exp_continue
}
-re "BP_List=($alpha_numeric).($digit)($digit)($digit)($digit)($digit)" {
set step_prefix $expect_out(1,string)
set dim1 $expect_out(2,string)
set dim2 $expect_out(3,string)
set dim3 $expect_out(4,string)
set dim4 $expect_out(5,string)
set dim5 $expect_out(6,string)
if [info exists use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)] {
incr use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)
} else {
send_user "\nFAILURE: invalid step cnode allocation at "
send_user "\[$dim1,$dim2,$dim3,$dim4,$dim5\]\n"
set exit_code 1
}
exp_continue
}
-re $prompt {
send "exit\r"
exp_continue
}
timeout {
send_user "\nFAILURE: job not responding\n"
set exit_code 1
}
}
#
# Test that each of the job's c-nodes have been allocated once to some step
#
for {set dim1 $job_start1} {$dim1 <= $job_fini1} {incr dim1} {
for {set dim2 $job_start2} {$dim2 <= $job_fini2} {incr dim2} {
for {set dim3 $job_start3} {$dim3 <= $job_fini3} {incr dim3} {
for {set dim4 $job_start4} {$dim4 <= $job_fini4} {incr dim4} {
for {set dim5 $job_start5} {$dim5 <= $job_fini5} {incr dim5} {
if {$use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) != 1} {
send_user "\nFAILURE: cnode at \[$dim1,$dim2,$dim3,$dim4,$dim5\] "
send_user "allocated $use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) times\n"
set exit_code 1
}
}
}
}
}
}
if {$exit_code == 0} {
send_user "\nSUCCESS\n"
} else {
cancel_job $job_id
}
exit $exit_code
#!/bin/bash
if [ $# -ne 4 ]; then
echo "test8.21.bash <srun_path> <squeue_path> <job_id> <job_size>"
exit 1
fi
srun=$1
squeue=$2
job_id=$3
job_size=$4
$srun -N1 --test-only /bin/true
sleep 5
while [ $job_size -ge 2 ]
do
job_size=`expr $job_size / 2`
$srun -N$job_size --test-only sleep 50 &
sleep 1
done
$srun -N1 --test-only sleep 50 &
sleep 5
$squeue --jobs=$job_id --steps --noheader --format='Step_ID=%i BP_List=%N'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment