Skip to content
Snippets Groups Projects
Commit 9d6f193e authored by Danny Auble's avatar Danny Auble
Browse files

hostfile tests for srun and poe

parent f9825ef5
No related branches found
No related tags found
No related merge requests found
...@@ -125,6 +125,7 @@ test1.48 Test of srun mail options (--mail-type and --mail-user options). ...@@ -125,6 +125,7 @@ test1.48 Test of srun mail options (--mail-type and --mail-user options).
test1.49 Test of srun task-prolog and task-epilog options. test1.49 Test of srun task-prolog and task-epilog options.
test1.50 Test of running non-existent job, confirm timely termination. test1.50 Test of running non-existent job, confirm timely termination.
test1.51 Test propagation of umask to spawned tasks. test1.51 Test propagation of umask to spawned tasks.
test1.52 Test of hostfile logic
**NOTE** The following tests attempt to utilize multiple CPUs or partitions, **NOTE** The following tests attempt to utilize multiple CPUs or partitions,
The test will print "WARNING" and terminate with an exit code of The test will print "WARNING" and terminate with an exit code of
...@@ -282,6 +283,7 @@ test11.3 Test running of Network protocol option (-msg_api) ...@@ -282,6 +283,7 @@ test11.3 Test running of Network protocol option (-msg_api)
test11.4 Test mpi jobs (must run make in mpi-testscripts dir) test11.4 Test mpi jobs (must run make in mpi-testscripts dir)
test11.5 Test of checkpoint logic (direct with srun) test11.5 Test of checkpoint logic (direct with srun)
test11.6 Test of checkpoint logic (with poe) test11.6 Test of checkpoint logic (with poe)
test11.7 Test of hostfile logic (with poe)
test12.# Testing of sacct command and options test12.# Testing of sacct command and options
......
#!/usr/bin/expect
############################################################################
# Purpose: Test of srun functionality
# Test of hostfile support (MP_HOSTFILE environment variable).
#
#
# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
# "FAILURE: ..." otherwise with an explanation of the failure, OR
# anything else indicates a failure mode that must be investigated.
#
############################################################################
# Copyright (C) 2002 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Danny Auble <da@llnl.gov>
# UCRL-CODE-2002-040.
#
# This file is part of SLURM, a resource management program.
# For details, see <http://www.llnl.gov/linux/slurm/>.
#
# SLURM is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
############################################################################
# Shared test-suite definitions. Names used below that this script never
# sets itself (print_header, $bin_rm, ...) are expected to come from here.
source ./globals

set test_id     "1.52"
set partition   "debug"
set exit_code   0
set num_nodes   2
set num_tasks   2
set node_count  0
set task_count  0
set job_id      0
# Scratch hostfile written between the two srun runs.
set hostfile    "test$test_id.hostfile"

print_header $test_id

# Remove any hostfile left behind by an earlier run of this test.
exec $bin_rm -f $hostfile

# This test only ever launches srun, so it is deliberately not gated on
# poe being installed (a poe check here would silently skip the srun test
# on every system without poe).
#
# Find out if we have enough nodes to test functionality.
# Every "TotalNodes=<n>" reported by "scontrol show partition" is examined;
# the test is skipped (WARNING, exit with the current exit code) as soon as
# one of them is below 2.
#
spawn $scontrol show partition
expect {
	-re "TotalNodes=($number)" {
		set node_count $expect_out(1,string)
		if { $node_count < 2 } {
			# Report the node count actually detected.
			send_user "WARNING: system must have at least 2\
				nodes to run this test on. This system\
				only has $node_count.\n"
			exit $exit_code
		}
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: scontrol not responding\n"
		exit 1
	}
	eof {
	}
}
#
# Run srun twice on two nodes.
#   Pass 0: no hostfile; record which node ran task 0 and task 1.
#   Pass 1: write a hostfile listing those two nodes in reverse order,
#           export it through MP_HOSTFILE, and run again.
# The test succeeds when the second run's task-to-node mapping is the
# reverse of the first.
#
set node0 0
set node1 0
set prev_node0 0
set prev_node1 0
for {set i 0} {$i<2} {incr i} {
	if { $i==1 } {
		# 0 is the "never seen" sentinel set before the loop.
		if { $node0 == 0 || $node1 == 0 } {
			send_user "\nFAILURE: node names not set from\
				previous srun run\n"
			exit 1
		}
		# NOTE(review): assumes this srun reads the hostfile location
		# from MP_HOSTFILE -- confirm against the srun under test.
		set env(MP_HOSTFILE) $hostfile
		set prev_node0 $node0
		set prev_node1 $node1
		# Reverse the order observed in the first run.
		set file [open $hostfile "w"]
		puts $file "$node1\n$node0"
		close $file
	}

	#
	# execute srun on two nodes with task labels (-l)
	#
	spawn $srun -N2 -l $bin_hostname
	expect {
		-re "0: ($alpha_numeric)" {
			set node0 $expect_out(1,string)
			exp_continue
		}
		-re "1: ($alpha_numeric)" {
			set node1 $expect_out(1,string)
			exp_continue
		}
		-re "slurm job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun not responding\n"
			# Only cancel when a job id was actually captured, and
			# do not let a failing scancel abort the script (exec
			# raises a Tcl error on a non-zero exit status).
			if { $job_id != 0 } {
				catch {exec $scancel --quiet $job_id}
			}
			set exit_code 1
		}
		eof {
		}
	}
}

# After the hostfile run, task 0 must be on the node that previously ran
# task 1 and vice versa.
if { [string compare $node0 $prev_node1] || [string compare $node1 $prev_node0] } {
	send_user "\nFAILURE: tasks not distributed by hostfile\n"
	set exit_code 1
}

if {$exit_code == 0} {
	# Leave the hostfile in place on failure to aid debugging.
	exec $bin_rm -f $hostfile
	send_user "\nSUCCESS\n"
}
exit $exit_code
#!/usr/bin/expect
############################################################################
# Purpose: Test of POE functionality
# Test of hostfile option (-hostfile).
#
#
# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
# "FAILURE: ..." otherwise with an explanation of the failure, OR
# anything else indicates a failure mode that must be investigated.
#
############################################################################
# Copyright (C) 2002 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Danny Auble <da@llnl.gov>
# UCRL-CODE-2002-040.
#
# This file is part of SLURM, a resource management program.
# For details, see <http://www.llnl.gov/linux/slurm/>.
#
# SLURM is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
############################################################################
# Pull in the shared test-suite definitions; names used below that this
# script never sets itself (print_header, $bin_rm, $poe, ...) are expected
# to come from here.
source ./globals

# Test identity and result bookkeeping.
set test_id     "11.7"
set exit_code   0
set job_id      0

# Job geometry requested from poe.
set partition   "debug"
set num_nodes   2
set num_tasks   2
set node_count  0
set task_count  0

# Scratch hostfile written between the two poe runs.
set hostfile    "test${test_id}.hostfile"

# Exported to the environment of every process spawned below.
set env(SLURM_LL_API_DEBUG) "2"

print_header $test_id

# Remove any hostfile left behind by an earlier run of this test.
exec $bin_rm -f $hostfile

# Without poe there is nothing to test: warn and leave with the
# current (successful) exit code.
if {[file exists $poe] == 0} {
	send_user "WARNING: poe must be installed on the system to run this test.\n"
	exit $exit_code
}
#
# Find out if we have enough nodes to test functionality.
# Every "TotalNodes=<n>" reported by "scontrol show partition" is examined;
# the test is skipped (WARNING, exit with the current exit code) as soon as
# one of them is below 2.
#
spawn $scontrol show partition
expect {
	-re "TotalNodes=($number)" {
		set node_count $expect_out(1,string)
		if { $node_count < 2 } {
			# Report the node count actually detected.
			send_user "WARNING: system must have at least 2\
				nodes to run this test on. This system\
				only has $node_count.\n"
			exit $exit_code
		}
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: scontrol not responding\n"
		exit 1
	}
	eof {
	}
}
#
# Run poe twice on two nodes.
#   Pass 0: no hostfile; record which node ran task 0 and task 1.
#   Pass 1: write a hostfile listing those two nodes in reverse order
#           and run again with -hostfile.
# The test succeeds when the second run's task-to-node mapping is the
# reverse of the first.
#
set node0 0
set node1 0
set prev_node0 0
set prev_node1 0
for {set i 0} {$i<2} {incr i} {
	if { $i==1 } {
		# 0 is the "never seen" sentinel set before the loop.
		if { $node0 == 0 || $node1 == 0 } {
			send_user "\nFAILURE: node names not set from\
				previous poe run\n"
			exit 1
		}
		set prev_node0 $node0
		set prev_node1 $node1
		# Reverse the order observed in the first run.
		set file [open $hostfile "w"]
		puts $file "$node1\n$node0"
		close $file
		#
		# execute poe again, this time driven by the hostfile
		#
		spawn $poe $bin_hostname -resd yes -rmpool $partition -procs \
			$num_tasks -nodes $num_nodes -retry wait \
			-hostfile $hostfile
	} else {
		#
		# execute poe with a specific node count
		#
		spawn $poe $bin_hostname -resd yes -rmpool $partition -procs \
			$num_tasks -nodes $num_nodes -retry wait
	}
	expect {
		-re "0:($alpha_numeric)" {
			set node0 $expect_out(1,string)
			exp_continue
		}
		-re "1:($alpha_numeric)" {
			set node1 $expect_out(1,string)
			exp_continue
		}
		-re "slurm job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: poe not responding\n"
			# Only cancel when a job id was actually captured, and
			# do not let a failing scancel abort the script (exec
			# raises a Tcl error on a non-zero exit status).
			if { $job_id != 0 } {
				catch {exec $scancel --quiet $job_id}
			}
			set exit_code 1
		}
		eof {
		}
	}
}

# After the hostfile run, task 0 must be on the node that previously ran
# task 1 and vice versa.
if { [string compare $node0 $prev_node1] || [string compare $node1 $prev_node0] } {
	send_user "\nFAILURE: tasks not distributed by hostfile\n"
	set exit_code 1
}

if {$exit_code == 0} {
	# Leave the hostfile in place on failure to aid debugging.
	exec $bin_rm -f $hostfile
	send_user "\nSUCCESS\n"
}
exit $exit_code
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment