Skip to content
Snippets Groups Projects
Commit 9f0c177b authored by Moe Jette's avatar Moe Jette
Browse files

Add gres/gpu test

parent aaa327d5
No related branches found
No related tags found
No related merge requests found
...@@ -76,6 +76,8 @@ EXTRA_DIST = \ ...@@ -76,6 +76,8 @@ EXTRA_DIST = \
test1.59 \ test1.59 \
test1.60 \ test1.60 \
test1.61 \ test1.61 \
test1.62 \
test1.62.bash \
test1.80 \ test1.80 \
test1.81 \ test1.81 \
test1.82 \ test1.82 \
......
...@@ -346,6 +346,8 @@ EXTRA_DIST = \ ...@@ -346,6 +346,8 @@ EXTRA_DIST = \
test1.59 \ test1.59 \
test1.60 \ test1.60 \
test1.61 \ test1.61 \
test1.62 \
test1.62.bash \
test1.80 \ test1.80 \
test1.81 \ test1.81 \
test1.82 \ test1.82 \
......
...@@ -164,6 +164,7 @@ test1.58 Test of srun --jobid for an existing job allocation ...@@ -164,6 +164,7 @@ test1.58 Test of srun --jobid for an existing job allocation
test1.59 Test of hostfile logic for job steps test1.59 Test of hostfile logic for job steps
test1.60 Test of labelling output when writing a file per task or per node test1.60 Test of labelling output when writing a file per task or per node
test1.61 Test of srun job step time limit test1.61 Test of srun job step time limit
test1.62 Test of gres/gpu plugin (if configured).
**NOTE** The following tests attempt to utilize multiple CPUs or partitions, **NOTE** The following tests attempt to utilize multiple CPUs or partitions,
The test will print "WARNING" and terminate with an exit code of The test will print "WARNING" and terminate with an exit code of
......
#!/usr/bin/expect
############################################################################
# Purpose: Test of SLURM functionality
# Test of gres/gpu plugin (if configured).
#
# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
# "FAILURE: ..." otherwise with an explanation of the failure, OR
# anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2010 Lawrence Livermore National Security
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of SLURM, a resource management program.
# For details, see <https://computing.llnl.gov/linux/slurm/>.
# Please also read the included file: DISCLAIMER.
#
# SLURM is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
# Load the shared test-suite definitions (presumably where $srun, $scontrol,
# $number, $max_job_delay, print_header and slow_kill come from, since none
# of them are defined in this file -- confirm against ./globals).
source ./globals
# Identifier passed to print_header and used to build the helper script name.
set test_id "1.62"
# Exit status of this test: 0 until a check fails.
set exit_code 0
# Name of the helper script that srun runs inside the allocation.
set file_in "test$test_id.bash"
#
# Run the helper script under srun with --gres=gpu:<gres_cnt> and validate
# the CUDA_VISIBLE_DEVICES line that it prints.
#
# gres_cnt IN - number of GPUs to request; the patterns below recognize
#               device lists holding one, two or three numbers
# RET 0 if the expected number of distinct devices was reported, or if the
#     request could not be allocated (a WARNING is printed);
#     1 if srun stopped responding, a device number was duplicated, or the
#     number of devices reported differs from gres_cnt
#
proc run_gpu_test { gres_cnt } {
	global max_job_delay srun number file_in

	set timeout $max_job_delay
	set bad_format 0
	set devices 0
	set invalid 0
	# Failure flag local to this proc. The caller's exit_code is not
	# visible here, so a timeout has to be reported via the return value.
	set timed_out 0

	set srun_pid [spawn $srun -N1 -n1 --gres=gpu:$gres_cnt -t1 $file_in]
	expect {
		-re "Unable to allocate" {
			incr invalid
			exp_continue
		}
		# Patterns are tried in the order listed, so the longest device
		# list must come first; otherwise the single-number pattern
		# would match the start of a longer list.
		-re "CUDA_VISIBLE_DEVICES=($number),($number),($number)" {
			if {$expect_out(1,string) == $expect_out(2,string)} {
				incr bad_format
			} elseif {$expect_out(2,string) == $expect_out(3,string)} {
				incr bad_format
			} elseif {$expect_out(1,string) == $expect_out(3,string)} {
				incr bad_format
			}
			incr devices 3
			exp_continue
		}
		-re "CUDA_VISIBLE_DEVICES=($number),($number)" {
			if {$expect_out(1,string) == $expect_out(2,string)} {
				incr bad_format
			}
			incr devices 2
			exp_continue
		}
		-re "CUDA_VISIBLE_DEVICES=($number)" {
			incr devices
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun not responding\n"
			slow_kill $srun_pid
			set timed_out 1
		}
		eof {
			wait
		}
	}

	if {$timed_out != 0} {
		return 1
	}
	if {$invalid != 0} {
		send_user "\nWARNING: Insufficient resources to test $gres_cnt GPUs\n"
		return 0
	}
	if {$bad_format != 0} {
		send_user "\nFAILURE: Duplicated device number in GRES allocation\n"
		return 1
	}
	if {$devices != $gres_cnt} {
		send_user "\nFAILURE: Expected $gres_cnt GPUs, but was allocated $devices\n"
		return 1
	}
	return 0
}
print_header $test_id

#
# Scan the "scontrol show config" output for a GresTypes entry that
# contains "gpu"; output is hidden from the user while scanning
#
set gres_gpu 0
log_user 0
spawn $scontrol show config
expect {
	-re "GresTypes *= \[a-zA-Z0-9_\,\-\]*gpu" {
		set gres_gpu 1
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1

if {$gres_gpu == 0} {
	send_user "\nWARNING: This test can not be run without gres/gpu configured\n"
	exit $exit_code
}

#
# Request 1, 2 and then 3 GPUs via srun, checking the devices reported
# each time; stop at the first request that fails
#
foreach gpu_request {1 2 3} {
	if {[run_gpu_test $gpu_request ] != 0} {
		incr exit_code
		break
	}
}

if {$exit_code == 0} {
	send_user "\nSUCCESS\n"
}
exit $exit_code
#!/bin/bash
# Print the CUDA_VISIBLE_DEVICES value seen by this job step; the test1.62
# expect script parses this line to count the GPU devices allocated.
# The expansion is quoted so the value is printed verbatim, with no word
# splitting or pathname expansion applied to it.
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment