Skip to content
Snippets Groups Projects
Commit f7a9aef8 authored by Danny Auble's avatar Danny Auble
Browse files

fix to work on bluegene system

parent cadb943c
No related branches found
No related tags found
No related merge requests found
...@@ -982,6 +982,37 @@ proc test_bluegene { } { ...@@ -982,6 +982,37 @@ proc test_bluegene { } {
return $bluegene return $bluegene
} }
################################################################
#
# Proc: test_select_type
#
# Purpose: Determine which select plugin is being used by
#	   reading the SelectType line of "scontrol show config"
#
# Returns name of select plugin (the text following "select/"),
#	   or an empty string if no such text was seen
#
################################################################
proc test_select_type { } {
	global scontrol bin_bash bin_grep alpha_numeric_under

	# Run through bash so the output can be piped into grep
	set config_cmd "exec $scontrol show config | $bin_grep SelectType"
	set plugin_name ""

	# Silence expect's echo of the command output while we parse it
	log_user 0
	spawn -noecho $bin_bash -c $config_cmd
	expect {
		-re "select/($alpha_numeric_under)" {
			# Remember the plugin name and keep reading until eof
			set plugin_name $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $plugin_name
}
################################################################ ################################################################
# #
# Proc: test_aix # Proc: test_aix
...@@ -1595,7 +1626,7 @@ proc get_bluegene_cnodes_per_mp { } { ...@@ -1595,7 +1626,7 @@ proc get_bluegene_cnodes_per_mp { } {
global scontrol number global scontrol number
log_user 0 log_user 0
set node_cnt 0 set node_cnt 1
set scon_pid [spawn -noecho $scontrol show config] set scon_pid [spawn -noecho $scontrol show config]
expect { expect {
-re "BasePartitionNodeCnt *= ($number)" { -re "BasePartitionNodeCnt *= ($number)" {
......
...@@ -39,59 +39,251 @@ set pnumsc 0 ...@@ -39,59 +39,251 @@ set pnumsc 0
set pnumsi 0 set pnumsi 0
set aprocsc 0 set aprocsc 0
set aprocsi 0 set aprocsi 0
set tcpuc 0 set inode_found 0
set tacpuc 0 set inode_name ""
print_header $test_id set smallest 1
set layout "static"
set select_type ""
set node_scaling 1
# proc scontrol_test { node proc_cnt } {
# Create a script global scontrol number prompt
# upvar spawn_id spawn_id
make_bash_script $file_in "
srun -N1 hostname
sleep 20
"
# set found 0
# Submit a job set rc 0
# Issue scontrol to find total and allocated processor counts
#
set timeout $max_job_delay
spawn $sbatch $file_in
spawn $scontrol show node send "$scontrol show node $node\n"
expect { expect {
-re " Procs=($number)" { -re "AllocProcs=($number)" {
set pnumsc $expect_out(1,string) set num_alloc $expect_out(1,string)
send_user "\nscontrol reports $pnumsc procs\n" set found 1
set tcpuc [expr $tcpuc + $pnumsc] if {$proc_cnt != $num_alloc} {
send_user "\nTotal CPU count is now $tcpuc procs\n" send_user "\nFAILURE: requested $proc_cnt but got $num_alloc instead\n"
exp_continue set rc 1
} else {
send_user "SUCCESS\n"
}
exp_continue
}
-re $prompt {
}
timeout {
send_user "\nFAILURE: scontrol not responding\n"
slow_kill $mypid
return 1
}
eof {
wait
}
} }
-re " AllocProcs=($number)" {
set pnumsi $expect_out(1,string) if {!$found} {
send_user "\nscontrol reports $pnumsi procs\n" send_user "\nFAILURE: didn't get expected output from scontrol\n"
set tacpuc [expr $tacpuc + $pnumsi] set rc 1
send_user "\nAllocated CPU count is now $tacpuc procs\n"
exp_continue
} }
timeout {
send_user "\nFAILURE: scontrol not responding\n" return $rc
set exit_code 1 }
# Check the CPU counts that sinfo reports for one node.
#
# The sinfo command is typed into the shell of the process that the
# caller spawned (spawn_id is taken from the caller's frame), and the
# "%C" field is parsed as four counts: allocated, idle, other, total.
#
# node        - name of the node to query
# proc_cnt    - number of CPUs expected to be allocated
# total_procs - number of CPUs expected in total
# idle_cpus   - number of CPUs expected to be idle
#
# Returns 0 if sinfo reported the expected counts, 1 otherwise
# (mismatch, no parsable output, or timeout).
proc sinfo_test_1 { node proc_cnt total_procs idle_cpus } {
	global sinfo number prompt
	upvar spawn_id spawn_id

	set found 0
	set rc 0

	send "$sinfo -h -o \"%C %A %N\" -n $node \n"
	expect {
		-re "($number)(k?).($number)(k?).($number)(k?).($number)(k?) ($number)(k?).($number)(k?) $node" {
			set found 1

			# Each count may carry a "k" suffix, in which case the
			# value is scaled by 1024.  Groups 5/6 (the "other"
			# count) are matched but not checked.
			set num_alloc $expect_out(1,string)
			if {[string compare $expect_out(2,string) ""]} {
				set num_alloc [expr {$num_alloc * 1024}]
			}
			set num_idle $expect_out(3,string)
			if {[string compare $expect_out(4,string) ""]} {
				set num_idle [expr {$num_idle * 1024}]
			}
			set num_total $expect_out(7,string)
			if {[string compare $expect_out(8,string) ""]} {
				set num_total [expr {$num_total * 1024}]
			}

			if { $num_alloc != $proc_cnt } {
				send_user "\nFAILURE: sinfo 1 allocated cpus wrong, got $num_alloc but needed $proc_cnt\n"
				set rc 1
			} elseif { $num_idle != $idle_cpus } {
				send_user "\nFAILURE: sinfo 1 idle cpus wrong, got $num_idle but needed $idle_cpus\n"
				set rc 1
			} elseif { $num_total != $total_procs } {
				# Report the values that were actually compared
				send_user "\nFAILURE: sinfo 1 total cpus wrong, got $num_total but needed $total_procs\n"
				set rc 1
			} else {
				send_user "SUCCESS\n"
			}
			exp_continue
		}
		-re $prompt {
		}
		timeout {
			# Nothing was spawned by this proc, so there is no pid
			# to kill here; the caller ends the session when it
			# sees the non-zero return.
			send_user "\nFAILURE: sinfo not responding\n"
			return 1
		}
		eof {
			wait
		}
	}

	if {!$found} {
		send_user "\nFAILURE: didn't get expected output from sinfo\n"
		set rc 1
	}

	return $rc
}
# Check the node counts that sinfo reports for one node.
#
# The sinfo command is typed into the shell of the process that the
# caller spawned (spawn_id is taken from the caller's frame).  The
# expected node counts are derived from the processor counts by
# dividing by the global node_scaling; when node_scaling is 0 one
# allocated node out of one total node is expected.
#
# node        - name of the node to query
# proc_cnt    - number of processors that were allocated
# total_procs - total number of processors on the node
#
# Returns 0 if the "alloc" and "idle" node counts match, 1 otherwise
# (mismatch or timeout).
proc sinfo_test_2 { node proc_cnt total_procs } {
	global sinfo number prompt node_scaling
	upvar spawn_id spawn_id

	set rc 0
	set num_alloc 0
	set num_idle 0
	set alloc_nodes 1
	set total_nodes 1

	# Guard against dividing by zero when no scaling factor is set
	if {$node_scaling} {
		set alloc_nodes [expr {$proc_cnt / $node_scaling}]
		set total_nodes [expr {$total_procs / $node_scaling}]
	}
	set idle_nodes [expr {$total_nodes - $alloc_nodes}]

	send "$sinfo -h -o \"%t %D %N\" -n $node \n"
	expect {
		-re "alloc ($number)(k?) $node" {
			set num_alloc $expect_out(1,string)
			# A "k" suffix means the count is scaled by 1024;
			# scale the value just read from sinfo.
			if {[string compare $expect_out(2,string) ""]} {
				set num_alloc [expr {$num_alloc * 1024}]
			}
			exp_continue
		}
		-re "idle ($number)(k?) $node" {
			set num_idle $expect_out(1,string)
			if {[string compare $expect_out(2,string) ""]} {
				set num_idle [expr {$num_idle * 1024}]
			}
			exp_continue
		}
		-re $prompt {
		}
		timeout {
			# Nothing was spawned by this proc, so there is no pid
			# to kill here; the caller ends the session when it
			# sees the non-zero return.
			send_user "\nFAILURE: sinfo not responding\n"
			return 1
		}
		eof {
			wait
		}
	}

	if { $num_alloc != $alloc_nodes } {
		send_user "\nFAILURE: sinfo 2 allocated nodes wrong, got $num_alloc but needed $alloc_nodes\n"
		set rc 1
	} elseif { $num_idle != $idle_nodes } {
		send_user "\nFAILURE: sinfo 2 idle nodes wrong, got $num_idle but needed $idle_nodes\n"
		set rc 1
	} else {
		send_user "SUCCESS\n"
	}

	return $rc
}
# Allocate proc_cnt processors on one node (salloc -w node -N1), run the
# scontrol and sinfo checks from the shell inside the allocation, and
# then quit right after.
#
# node        - node to allocate on
# proc_cnt    - number of processors to request
# total_procs - total number of processors on the node
#
# Returns 0 only when the checks ran and all passed; 1 on any check
# failure, on "Unable to contact", on timeout, or when salloc ended
# before a prompt was seen.
proc allocate_and_quit { node proc_cnt total_procs } {
	global salloc number prompt

	set job_id 0
	set checked 0
	set rc 0
	# Local timeout picked up by the expect below
	set timeout 60
	set idle_cpus [expr {$total_procs - $proc_cnt}]

	set mypid [spawn $salloc -w $node -N1 -n $proc_cnt]
	expect {
		-re "Granted job allocation ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		-re $prompt {
			set checked 1
			# Run the checks in order and stop at the first failure:
			# scontrol cpu count, sinfo cpu count, sinfo node count.
			if { [scontrol_test $node $proc_cnt]
			     || [sinfo_test_1 $node $proc_cnt $total_procs $idle_cpus]
			     || [sinfo_test_2 $node $proc_cnt $total_procs] } {
				send "exit\n"
				return 1
			}
		}
		-re "Unable to contact" {
			send_user "\nFAILURE: slurm appears to be down\n"
			# Make sure the reported failure reaches the caller
			set rc 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: salloc not responding\n"
			if {$job_id != 0} {
				cancel_job $job_id
			}
			slow_kill $mypid
			return 1
		}
		eof {
			wait
		}
	}

	# Falling out of the expect without ever seeing a prompt means salloc
	# ended (eof) before any check could run: that is not a success, and
	# there is no session left to send "exit" to.
	if {!$checked} {
		send_user "\nFAILURE: salloc exited before the allocation could be checked\n"
		return 1
	}

	if {$job_id} {
		send "exit\n"
	}
	return $rc
}
set simatches 0 # test starts here
spawn $sinfo -o %C
print_header $test_id
#find me an idle node
log_user 0
spawn $scontrol show node
expect { expect {
-re "($number).$number.$number.($number)" { -re "NodeName=($alpha_numeric_under) State=IDLE Procs=($number)(k?)" {
set aprocsi $expect_out(1,string) if {!$inode_found} {
set pnumsi $expect_out(2,string) set inode_name $expect_out(1,string)
send_user "\nsinfo reports $aprocsi allocated procs\n" set inode_procs $expect_out(2,string)
send_user "\nsinfo reports $pnumsi procs\n" if {[string compare $expect_out(3,string) ""]} {
incr simatches set inode_procs [expr $inode_procs * 1024]
}
set inode_found 1
}
exp_continue exp_continue
} }
timeout { timeout {
...@@ -102,26 +294,69 @@ expect { ...@@ -102,26 +294,69 @@ expect {
wait wait
} }
} }
if {$simatches != 1} { log_user 1
send_user "\nFAILURE: Unexpected output $simatches\n"
set exit_code 1 if {!$inode_found} {
send_user "\nFAILURE: couldn't find an idle node on the system\n"
exit 1
} }
if {$tcpuc == $pnumsi} { send_user "found idle node $inode_name with $inode_procs\n"
send_user "\nGOOD: Total procs are a match at $tcpuc\n"
} else {
send_user "\nFAILURE: Total procs NOT the same\n"
set exit_code 1
}
if {$tacpuc == $aprocsi} { # figure out the select plugin we are using
send_user "\nGOOD: Allocated Procs are a match at $tacpuc\n" set select_type [test_select_type]
} else { if {![string compare $select_type "bluegene"]} {
send_user "\nFAILURE: Allocated Procs NOT the same\n" # figure out some things if a bluegene system
set exit_code 1 set layout [get_bluegene_layout]
if {$layout == 0} {
send_user "\nFAILURE: No layout mode found for this system\n"
exit 1
}
set psets [get_bluegene_psets]
if {$psets == 0} {
send_user "\nFAILURE: No psets are set on this system\n"
exit 1
} }
set type [get_bluegene_type]
if {$type == 0} {
send_user "\nFAILURE: No bluegene type found \n"
exit 1
}
if {![string compare $type "P"]} {
if {$psets >= 32} {
set smallest 16
} elseif {$psets >= 16} {
set smallest 32
} elseif {$psets >= 8} {
set smallest 64
} else {
set smallest 128
}
} elseif {![string compare $type "L"]} {
if {$psets >= 16} {
set smallest 32
} else {
set smallest 128
}
} else {
send_user "\nFAILURE: unknown bluegene system type '$type'\n";
exit 1
}
set node_scaling [get_bluegene_procs_per_cnode]
set smallest [expr $smallest * $node_scaling]
} elseif {![string compare $select_type "cons_res"]} {
set smallest 1
} else {
set smallest $inode_procs
}
set exit_code [allocate_and_quit $inode_name $smallest $inode_procs]
if {$exit_code == 0} { if {$exit_code == 0} {
send_user "\nSUCCESS\n" send_user "\nSUCCESS\n"
} else {
exit $exit_code
} }
exit $exit_code
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment