Skip to content
Snippets Groups Projects
Commit d49fd8f9 authored by Morris Jette's avatar Morris Jette
Browse files

Bluegene/Q test enhancements

These changes more thoroughly test Bluegene/Q job step placement
algorithms and validate several recent bug fixes in the SLURM code.
parent ac6e649d
No related branches found
No related tags found
No related merge requests found
......@@ -139,9 +139,10 @@ if {$actual_job_size != $job_size} {
# NOTE: Change this to increment step size by one once SLURM logic can
# automatically increase step size as needed
#
set timeout 60
set step_id 0
set max_step_size 0
for {set step_size 1 } {$step_size <= $job_size} {set step_size [expr $step_size * 2]} {
for {set step_size 1 } {$step_size <= $job_size} {set step_size [expr $step_size + 1]} {
send_user "\nRunning step $job_id.$step_id at size $step_size\n"
set step_start1 -1
send "$srun -N$step_size --test-only $scontrol show step $job_id.$step_id\r"
......
......@@ -149,8 +149,8 @@ for {set dim1 $job_start1} {$dim1 <= $job_fini1} {incr dim1} {
}
}
set timeout 30
send "./$file_prog $srun $squeue $job_id $actual_job_size\r"
set timeout 60
send "./$file_prog $srun $squeue $job_id $actual_job_size 1\r"
expect {
-re "BP_List=($alpha_numeric).($digit)($digit)($digit)($digit)($digit)x($digit)($digit)($digit)($digit)($digit)" {
set step_prefix $expect_out(1,string)
......@@ -200,8 +200,7 @@ expect {
exp_continue
}
-re $prompt {
send "exit\r"
exp_continue
#break
}
timeout {
send_user "\nFAILURE: job not responding\n"
......@@ -228,6 +227,109 @@ for {set dim1 $job_start1} {$dim1 <= $job_fini1} {incr dim1} {
}
}
# Progress report: printed only while no failure has been recorded in
# exit_code.
if {0 == $exit_code} {
    send_user "\nSo far, so good...\n\n"
}
#
# Reset the per-c-node step allocation counters: every coordinate in the
# job's five-dimensional range (job_start1..5 through job_fini1..5) gets a
# use_cnt entry of 0.
#
for {set dim1 $job_start1} {$dim1 <= $job_fini1} {incr dim1} {
    for {set dim2 $job_start2} {$dim2 <= $job_fini2} {incr dim2} {
        for {set dim3 $job_start3} {$dim3 <= $job_fini3} {incr dim3} {
            for {set dim4 $job_start4} {$dim4 <= $job_fini4} {incr dim4} {
                for {set dim5 $job_start5} {$dim5 <= $job_fini5} {incr dim5} {
                    # The array key is the comma-joined coordinate tuple.
                    array set use_cnt [list "$dim1,$dim2,$dim3,$dim4,$dim5" 0]
                }
            }
        }
    }
}
#
# This is a randomized variation on the above logic and includes a full
# allocation job step. Some job steps may not start due to packing issues
#
# Runs $file_prog in mode 2 and tallies, in use_cnt, every c-node named in
# each "BP_List=" line of its output. A c-node with no existing use_cnt
# entry is reported as a failure and sets exit_code to 1.
#
set timeout 60
send "./$file_prog $srun $squeue $job_id $actual_job_size 2\r"
expect {
    -re "BP_List=($alpha_numeric).($digit)($digit)($digit)($digit)($digit)x($digit)($digit)($digit)($digit)($digit)" {
        # Step covers a range of c-nodes: captures 2-6 are the start
        # coordinates and captures 7-11 are the end coordinates.
        set step_prefix $expect_out(1,string)
        set step_start1 $expect_out(2,string)
        set step_start2 $expect_out(3,string)
        set step_start3 $expect_out(4,string)
        set step_start4 $expect_out(5,string)
        set step_start5 $expect_out(6,string)
        set step_fini1 $expect_out(7,string)
        set step_fini2 $expect_out(8,string)
        set step_fini3 $expect_out(9,string)
        set step_fini4 $expect_out(10,string)
        set step_fini5 $expect_out(11,string)
        # Count one use for every c-node inside the step's range.
        for {set dim1 $step_start1} {$dim1 <= $step_fini1} {incr dim1} {
            for {set dim2 $step_start2} {$dim2 <= $step_fini2} {incr dim2} {
                for {set dim3 $step_start3} {$dim3 <= $step_fini3} {incr dim3} {
                    for {set dim4 $step_start4} {$dim4 <= $step_fini4} {incr dim4} {
                        for {set dim5 $step_start5} {$dim5 <= $step_fini5} {incr dim5} {
                            if {[info exists use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)]} {
                                incr use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)
                            } else {
                                # Terminate the message with a newline, as the
                                # single c-node branch below does.
                                send_user "\nFAILURE: invalid step cnode allocation at "
                                send_user "\[$dim1,$dim2,$dim3,$dim4,$dim5\]\n"
                                set exit_code 1
                            }
                        }
                    }
                }
            }
        }
        exp_continue
    }
    -re "BP_List=($alpha_numeric).($digit)($digit)($digit)($digit)($digit)" {
        # Step covers a single c-node: captures 2-6 are its coordinates.
        set step_prefix $expect_out(1,string)
        set dim1 $expect_out(2,string)
        set dim2 $expect_out(3,string)
        set dim3 $expect_out(4,string)
        set dim4 $expect_out(5,string)
        set dim5 $expect_out(6,string)
        if {[info exists use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)]} {
            incr use_cnt($dim1,$dim2,$dim3,$dim4,$dim5)
        } else {
            send_user "\nFAILURE: invalid step cnode allocation at "
            send_user "\[$dim1,$dim2,$dim3,$dim4,$dim5\]\n"
            set exit_code 1
        }
        exp_continue
    }
    -re $prompt {
        # The prompt is back, so the helper has finished; leave the shell.
        send_user "\nNOTE: Step create errors due to busy nodes are expected\n"
        send "exit\r"
        exp_continue
    }
    timeout {
        send_user "\nFAILURE: job not responding\n"
        set exit_code 1
    }
}
#
# Verify that no c-node in the job's coordinate range was counted for more
# than one step: any use_cnt entry above 1 is reported and sets exit_code
# to 1.
#
for {set dim1 $job_start1} {$dim1 <= $job_fini1} {incr dim1} {
    for {set dim2 $job_start2} {$dim2 <= $job_fini2} {incr dim2} {
        for {set dim3 $job_start3} {$dim3 <= $job_fini3} {incr dim3} {
            for {set dim4 $job_start4} {$dim4 <= $job_fini4} {incr dim4} {
                for {set dim5 $job_start5} {$dim5 <= $job_fini5} {incr dim5} {
                    # Counts of 0 or 1 are acceptable; move to the next c-node.
                    if {$use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) <= 1} {
                        continue
                    }
                    send_user "\nFAILURE: cnode at \[$dim1,$dim2,$dim3,$dim4,$dim5\] "
                    send_user "allocated $use_cnt($dim1,$dim2,$dim3,$dim4,$dim5) times\n"
                    set exit_code 1
                }
            }
        }
    }
}
if {$exit_code == 0} {
send_user "\nSUCCESS\n"
} else {
......
#!/bin/bash
#
# Usage: test8.21.bash <srun_path> <squeue_path> <job_id> <job_size> <mode:1|2>
#
# Launches a series of background job steps of decreasing size with srun,
# then prints the job's step list with squeue.
#   mode 1:  step sizes are job_size/2, job_size/4, ... down to 1, followed
#            by one more 1-node step; sleep_time is 1 second.
#   mode >1: job_size is doubled first, so the first step is the full
#            job_size; sleep_time is 0.
#
if [ $# -ne 5 ]; then
  echo "test8.21.bash <srun_path> <squeue_path> <job_id> <job_size> <mode:1|2>"
  exit 1
fi

srun=$1
squeue=$2
job_id=$3
job_size=$4
test_mode=$5

# NOTE(review): this unconditional probe and 5 second pause precede the retry
# loop below, which performs the same probe; confirm both are still wanted.
"$srun" -N1 --test-only /bin/true
sleep 5

# Retry a 1-node probe step until it succeeds, sleeping one second longer
# after each failure; give up once the delay would exceed 60 seconds.
delay_time=1
while [ "$delay_time" -le 60 ]; do
  if "$srun" -N1 --test-only --immediate /bin/true; then
    break
  fi
  sleep "$delay_time"
  delay_time=$((delay_time + 1))
done

if [ "$test_mode" -gt 1 ]; then
  # Doubling here makes the first halving below yield the full job size.
  job_size=$((job_size + job_size))
  sleep_time=0
else
  sleep_time=1
fi

# Start one background step per halving of job_size.
while [ "$job_size" -ge 2 ]; do
  job_size=$((job_size / 2))
  "$srun" "-N${job_size}" --test-only sleep 50 &
  # NOTE(review): this fixed 1 second pause is in addition to sleep_time, so
  # mode 2 still waits between launches; confirm that is intended.
  sleep 1
  sleep "$sleep_time"
done

"$srun" -N1 --test-only sleep 50 &
sleep 5
"$squeue" "--jobs=${job_id}" --steps --noheader --format='Step_ID=%i BP_List=%N'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment